Skip to content

Commit a1544c0

Browse files
authored
Merge pull request #13018 from hjelmn/wip_uct_improvements
btl/uct: reduce number of messages sent when establishing connections
2 parents 2514b6e + 41ad9f7 commit a1544c0

File tree

4 files changed

+73
-33
lines changed

4 files changed

+73
-33
lines changed

opal/mca/btl/uct/btl_uct.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2019 Google, LLC. All rights reserved.
15+
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
1616
* Copyright (c) 2019 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
1818
* All Rights reserved.
@@ -40,6 +40,8 @@
4040
#include "opal/mca/mpool/mpool.h"
4141
#include "opal/mca/pmix/pmix-internal.h"
4242
#include "opal/mca/rcache/base/base.h"
43+
#include "opal/mca/threads/condition.h"
44+
#include "opal/mca/threads/mutex.h"
4345
#include "opal/mca/threads/tsd.h"
4446
#include "opal/util/event.h"
4547
#include <uct/api/uct.h>
@@ -153,6 +155,9 @@ struct mca_btl_uct_component_t {
153155

154156
/** disable UCX memory hooks */
155157
bool disable_ucx_memory_hooks;
158+
159+
/** connection retry timeout */
160+
unsigned int connection_retry_timeout;
156161
};
157162
typedef struct mca_btl_uct_component_t mca_btl_uct_component_t;
158163

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,17 @@ static int mca_btl_uct_component_register(void)
102102
MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts);
103103
#endif
104104

105+
/* timeout between connection message attempts in µs */
106+
mca_btl_uct_component.connection_retry_timeout = 2000;
107+
(void) mca_base_component_var_register(
108+
&mca_btl_uct_component.super.btl_version, "connection_retry_timeout",
109+
"Timeout between attempts to send connection messages for connect-to-"
110+
"endpoint connections. The timeout is measured in µs and is only "
111+
"necessary when using unreliable transports for connections (ex: UD). "
112+
"(default: 2000µs)",
113+
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4,
114+
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout);
115+
105116
/* for now we want this component to lose to btl/ugni and btl/vader */
106117
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1;
107118

opal/mca/btl/uct/btl_uct_endpoint.c

Lines changed: 50 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* reserved.
55
* Copyright (c) 2018 Triad National Security, LLC. All rights
66
* reserved.
7-
* Copyright (c) 2019 Google, LLC. All rights reserved.
7+
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
88
* $COPYRIGHT$
99
*
1010
* Additional copyrights may follow
@@ -16,6 +16,7 @@
1616
#include "btl_uct.h"
1717
#include "btl_uct_am.h"
1818
#include "btl_uct_device_context.h"
19+
#include "opal/mca/timer/base/base.h"
1920
#include "opal/util/proc.h"
2021

2122
static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint)
@@ -257,21 +258,17 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
257258
return OPAL_SUCCESS;
258259
}
259260

260-
static int mca_btl_uct_endpoint_connect_endpoint(
261+
static int mca_btl_uct_endpoint_send_connection_data(
261262
mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl,
262263
mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint,
263-
uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr)
264+
uint8_t *conn_tl_data, int request_type)
264265
{
265-
size_t request_length = sizeof(mca_btl_uct_conn_req_t)
266-
+ MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
267-
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
268266
mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
269267
mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
270-
mca_btl_uct_conn_req_t *request = alloca(request_length);
268+
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
271269
uct_device_addr_t *device_addr = NULL;
272270
uct_iface_addr_t *iface_addr;
273271
ucs_status_t ucs_status;
274-
int rc;
275272

276273
assert(NULL != conn_tl);
277274

@@ -302,15 +299,50 @@ static int mca_btl_uct_endpoint_connect_endpoint(
302299
ucs_status));
303300
return OPAL_ERROR;
304301
}
305-
} else {
306-
OBJ_RETAIN(conn_ep);
307302
}
308303

304+
size_t request_length = sizeof(mca_btl_uct_conn_req_t)
305+
+ MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
306+
mca_btl_uct_conn_req_t *request = alloca(request_length);
307+
309308
/* fill in common request parameters */
310309
request->proc_name = OPAL_PROC_MY_NAME;
311310
request->context_id = tl_context->context_id;
312311
request->tl_index = tl->tl_index;
313-
request->type = !!(ep_addr);
312+
request->type = request_type;
313+
314+
/* fill in connection request */
315+
ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
316+
if (UCS_OK != ucs_status) {
317+
/* this is a fatal error */
318+
OBJ_RELEASE(endpoint->conn_ep);
319+
uct_ep_destroy(tl_endpoint->uct_ep);
320+
tl_endpoint->uct_ep = NULL;
321+
return OPAL_ERROR;
322+
}
323+
324+
/* let the remote side know that the connection has been established and
325+
* wait for the message to be sent */
326+
int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request,
327+
request_length);
328+
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
329+
OBJ_RELEASE(endpoint->conn_ep);
330+
uct_ep_destroy(tl_endpoint->uct_ep);
331+
tl_endpoint->uct_ep = NULL;
332+
return OPAL_ERROR;
333+
}
334+
335+
tl_endpoint->last_connection_req = opal_timer_base_get_usec();
336+
337+
return OPAL_SUCCESS;
338+
}
339+
340+
static int mca_btl_uct_endpoint_connect_endpoint(
341+
mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl,
342+
mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint,
343+
uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr)
344+
{
345+
ucs_status_t ucs_status;
314346

315347
if (NULL == tl_endpoint->uct_ep) {
316348
BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data",
@@ -338,29 +370,15 @@ static int mca_btl_uct_endpoint_connect_endpoint(
338370
}
339371
}
340372

341-
/* fill in connection request */
342-
ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
343-
if (UCS_OK != ucs_status) {
344-
/* this is a fatal a fatal error */
345-
OBJ_RELEASE(endpoint->conn_ep);
346-
uct_ep_destroy(tl_endpoint->uct_ep);
347-
tl_endpoint->uct_ep = NULL;
348-
return OPAL_ERROR;
349-
}
350-
351-
/* let the remote side know that the connection has been established and
352-
* wait for the message to be sent */
353-
rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request,
354-
request_length);
355-
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
356-
OBJ_RELEASE(endpoint->conn_ep);
357-
uct_ep_destroy(tl_endpoint->uct_ep);
358-
tl_endpoint->uct_ep = NULL;
359-
return OPAL_ERROR;
373+
opal_timer_t now = opal_timer_base_get_usec();
374+
if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) {
375+
return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS
376+
: OPAL_ERR_OUT_OF_RESOURCE;
360377
}
361378

362-
return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS
363-
: OPAL_ERR_OUT_OF_RESOURCE;
379+
int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint,
380+
conn_tl_data, /*request_type=*/!!ep_addr);
381+
return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc;
364382
}
365383

366384
int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint,

opal/mca/btl/uct/btl_uct_types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -14,6 +15,8 @@
1415

1516
# include "opal/mca/btl/btl.h"
1617

18+
#include "opal/mca/timer/base/base.h"
19+
1720
/* forward declarations */
1821
struct mca_btl_uct_module_t;
1922
struct mca_btl_base_endpoint_t;
@@ -100,6 +103,9 @@ struct mca_btl_uct_tl_endpoint_t {
100103

101104
/** UCT endpoint handle */
102105
uct_ep_h uct_ep;
106+
107+
/** Time of last connection message. */
108+
opal_timer_t last_connection_req;
103109
};
104110

105111
typedef struct mca_btl_uct_tl_endpoint_t mca_btl_uct_tl_endpoint_t;

0 commit comments

Comments
 (0)