Skip to content

Commit f99cbcd

Browse files
committed
btl/uct: fix a race condition when setting up endpoints
This commit fixes a regression I introduced when changing how the endpoint flags are set. There is no guarantee when the remote completion message comes in that the local endpoint has been connected (only the remote endpoint). Instead of always setting the endpoint as ready here the code now only sets that the remote is connected and then sets the endpoint as ready only if both the local and remote endpoints are connected. When the local endpoint is connected it similarly checks that the remote is connected before setting the endpoint as ready. This was verified using RMA-MT with 128 threads and it does fix the race. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent a477b22 commit f99cbcd

File tree

4 files changed

+34
-17
lines changed

4 files changed

+34
-17
lines changed

opal/mca/btl/uct/btl_uct_endpoint.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,9 @@ static int mca_btl_uct_endpoint_connect_endpoint(
368368
if (UCS_OK != ucs_status) {
369369
return OPAL_ERROR;
370370
}
371+
372+
mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint,
373+
MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED);
371374
}
372375

373376
opal_timer_t now = opal_timer_base_get_usec();

opal/mca/btl/uct/btl_uct_endpoint.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
1616
* All Rights reserved.
17+
* Copyright (c) 2025 Google, LLC. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -106,5 +107,25 @@ static inline int mca_btl_uct_endpoint_check_am(mca_btl_uct_module_t *module,
106107
module->am_tl->tl_index);
107108
}
108109

110+
// Requires that the endpoint lock is held.
111+
static inline void mca_btl_uct_endpoint_set_flag(mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint,
112+
int context_id, mca_btl_uct_tl_endpoint_t *tl_endpoint, int32_t flag) {
113+
opal_atomic_wmb();
114+
int32_t flag_value = opal_atomic_or_fetch_32(&tl_endpoint->flags, flag);
115+
if ((flag_value & (MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY)) ==
116+
(MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY)) {
117+
opal_atomic_fetch_or_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
118+
119+
opal_atomic_wmb();
120+
121+
mca_btl_uct_base_frag_t *frag;
122+
OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
123+
if (frag->context->context_id == context_id && endpoint == frag->endpoint) {
124+
frag->ready = true;
125+
}
126+
}
127+
}
128+
}
129+
109130
END_C_DECLS
110131
#endif

opal/mca/btl/uct/btl_uct_tl.c

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* and Technology (RIST). All rights reserved.
77
* Copyright (c) 2018 Triad National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2019 Google, LLC. All rights reserved.
9+
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -199,9 +199,6 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
199199
int32_t ep_flags;
200200
int rc;
201201

202-
BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d",
203-
(void *) endpoint, req->type, req->context_id));
204-
205202
if (NULL == endpoint) {
206203
BTL_ERROR(("could not create endpoint for connection request"));
207204
return UCS_ERR_UNREACHABLE;
@@ -211,6 +208,9 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
211208

212209
ep_flags = opal_atomic_fetch_or_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC);
213210

211+
BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d. ep_flags = %x",
212+
(void *) endpoint, req->type, req->context_id, ep_flags));
213+
214214
if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) {
215215
/* create any necessary resources */
216216
rc = mca_btl_uct_endpoint_connect(module, endpoint, req->context_id, req->ep_addr,
@@ -225,22 +225,13 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
225225
* message. this might be overkill but there is little documentation at the UCT level on when
226226
* an endpoint can be used. */
227227
if (req->type == 1) {
228-
/* remote side is ready */
229-
mca_btl_uct_base_frag_t *frag;
230-
228+
/* remote side is connected */
231229
/* to avoid a race with send adding pending frags grab the lock here */
232230
OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock, {
233231
BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags",
234232
opal_list_get_size(&module->pending_frags)));
235-
(void) opal_atomic_or_fetch_32(&tl_endpoint->flags,
236-
MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
237-
opal_atomic_wmb();
238-
239-
OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
240-
if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) {
241-
frag->ready = true;
242-
}
243-
}
233+
mca_btl_uct_endpoint_set_flag(module, endpoint, req->context_id, tl_endpoint,
234+
MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY);
244235
});
245236
}
246237

opal/mca/btl/uct/btl_uct_types.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ struct mca_btl_uct_base_frag_t;
2727
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC 0x1
2828
/** remote endpoint read */
2929
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY 0x2
30+
/** local UCT endpoint connected */
31+
# define MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED 0x4
3032
/** connection was established */
31-
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x4
33+
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x8
3234

3335
/* AM tags */
3436
/** BTL fragment */

0 commit comments

Comments
 (0)