Skip to content

Commit 0263456

Browse files
author
Thananon Patinyasakdikul
committed
pml/ob1: fix deadlock with communicator flag ALLOW_OVERTAKE.
We missed an assert to check if ALLOW_OVERTAKE is set or not before validating the sequence number and this will cause deadlock. Signed-off-by: Thananon Patinyasakdikul <[email protected]>
1 parent d1fd1f4 commit 0263456

File tree

1 file changed

+18
-12
lines changed

1 file changed

+18
-12
lines changed

ompi/mca/pml/ob1/pml_ob1_recvfrag.c

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2018 The University of Tennessee and The University
6+
* Copyright (c) 2004-2019 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@@ -963,19 +963,21 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
963963
frag_msg_seq = hdr->hdr_seq;
964964
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
965965

966-
/* If the sequence number is wrong, queue it up for later. */
967-
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) {
968-
mca_pml_ob1_recv_frag_t* frag;
969-
MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
970-
MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
971-
append_frag_to_ordered_list(&proc->frags_cant_match, frag, next_msg_seq_expected);
966+
if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm_ptr)) {
967+
/* If the sequence number is wrong, queue it up for later. */
968+
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) {
969+
mca_pml_ob1_recv_frag_t* frag;
970+
MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
971+
MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
972+
append_frag_to_ordered_list(&proc->frags_cant_match, frag, next_msg_seq_expected);
972973

973-
SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1);
974-
SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1);
975-
SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE);
974+
SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1);
975+
SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1);
976+
SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE);
976977

977-
OB1_MATCHING_UNLOCK(&comm->matching_lock);
978-
return OMPI_SUCCESS;
978+
OB1_MATCHING_UNLOCK(&comm->matching_lock);
979+
return OMPI_SUCCESS;
980+
}
979981
}
980982

981983
/* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
@@ -1011,6 +1013,10 @@ mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl,
10111013

10121014
match_this_frag:
10131015
/* We're now expecting the next sequence number. */
1016+
/* NOTE: We should have checked for ALLOW_OVERTAKE comm flag here
1017+
* but adding a branch in this critical path is not ideal for performance.
1018+
* We decided to let it run the sequence number even we are not doing
1019+
* anything with it. */
10141020
proc->expected_sequence++;
10151021

10161022
/* We generate the SEARCH_POSTED_QUEUE only when the message is

0 commit comments

Comments
 (0)