Skip to content

Commit 27efeb9

Browse files
committed
pml/ob1: fix double increment of the RDMA frag retry counter
If a put or get operation fails, it may later be retried by mca_pml_ob1_process_pending_rdma, which increments the retries counter on each new attempt. There is a flaw in the code where both the put and get failure paths also increment this counter, causing the fragment to give up twice as fast as intended. This commit removes the increments on the put and get failures. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 020e83f commit 27efeb9

File tree

2 files changed

+20
-19
lines changed

2 files changed

+20
-19
lines changed

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr
382382
}
383383
}
384384

385-
if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
385+
if (frag->retries < mca_pml_ob1.rdma_retries_limit &&
386386
OMPI_ERR_OUT_OF_RESOURCE == rc) {
387387
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
388388
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);

ompi/mca/pml/ob1/pml_ob1_sendreq.c

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f
12751275
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
12761276
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
12771277

1278-
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
1278+
if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
12791279
/* queue the frag for later if there was a resource error */
12801280
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
12811281
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
12821282
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
1283-
} else {
1283+
return;
1284+
}
1285+
12841286
#if OPAL_ENABLE_FT
1285-
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
1286-
return;
1287-
}
1288-
#endif /* OPAL_ENABLE_FT */
1289-
/* tell receiver to deregister memory */
1290-
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1291-
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
1292-
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
1293-
1294-
/* send fragment by copy in/out */
1295-
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
1296-
frag->rdma_length);
1297-
/* if a pointer to a receive request is not set it means that
1298-
* ACK was not yet received. Don't schedule sends before ACK */
1299-
if (NULL != sendreq->req_recv.pval)
1300-
mca_pml_ob1_send_request_schedule (sendreq);
1287+
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
1288+
return;
13011289
}
1290+
#endif /* OPAL_ENABLE_FT */
1291+
/* tell receiver to deregister memory */
1292+
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1293+
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
1294+
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
1295+
1296+
/* send fragment by copy in/out */
1297+
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
1298+
frag->rdma_length);
1299+
/* if a pointer to a receive request is not set it means that
1300+
* ACK was not yet received. Don't schedule sends before ACK */
1301+
if (NULL != sendreq->req_recv.pval)
1302+
mca_pml_ob1_send_request_schedule (sendreq);
13021303
}
13031304

13041305
/**

0 commit comments

Comments
 (0)