Skip to content

Commit 7aeb655

Browse files
authored
Merge pull request #6395 from brminich/topic/ucx_net_waddr_4.0.x
PML/UCX: Use net worker address for remote peers - v4.0.x
2 parents e82523f + 1c51494 commit 7aeb655

File tree

2 files changed

+65
-4
lines changed

2 files changed

+65
-4
lines changed

config/ompi_check_ucx.m4

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ AC_DEFUN([OMPI_CHECK_UCX],[
120120
UCP_ATOMIC_FETCH_OP_FXOR],
121121
[], [],
122122
[#include <ucp/api/ucp.h>])
123+
AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
124+
[AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1],
125+
[have worker address attribute])], [],
126+
[#include <ucp/api/ucp.h>])
123127
CPPFLAGS=$old_CPPFLAGS
124128

125129
OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])])

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,46 @@ mca_pml_ucx_module_t ompi_pml_ucx = {
8282
#define PML_UCX_REQ_ALLOCA() \
8383
((char *)alloca(ompi_pml_ucx.request_size) + ompi_pml_ucx.request_size);
8484

#if HAVE_UCP_WORKER_ADDRESS_FLAGS
/*
 * Query the UCP worker for an address restricted by addr_flags and publish
 * it through the modex with the given visibility scope.
 *
 * addr_flags   UCP_WORKER_ADDRESS_FLAG_* mask selecting which transport
 *              addresses are packed (0 == all transports).
 * modex_scope  OPAL_PMIX_LOCAL or OPAL_PMIX_REMOTE — which peers will be
 *              able to fetch this address blob.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if either the worker query or the
 * modex send fails.
 */
static int mca_pml_ucx_send_worker_address_type(int addr_flags, int modex_scope)
{
    ucs_status_t status;
    ucp_worker_attr_t attrs;
    int rc;

    attrs.field_mask    = UCP_WORKER_ATTR_FIELD_ADDRESS |
                          UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
    attrs.address_flags = addr_flags;

    status = ucp_worker_query(ompi_pml_ucx.ucp_worker, &attrs);
    if (UCS_OK != status) {
        PML_UCX_ERROR("Failed to query UCP worker address");
        return OMPI_ERROR;
    }

    OPAL_MODEX_SEND(rc, modex_scope, &mca_pml_ucx_component.pmlm_version,
                    (void*)attrs.address, attrs.address_length);

    /* The address buffer is owned by UCX; release it unconditionally so the
     * modex-failure path below does not leak it. */
    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, attrs.address);

    if (OMPI_SUCCESS != rc) {
        return OMPI_ERROR;
    }

    /* attrs.address_length is a size_t: %zu, not %ld (which is a
     * format/argument mismatch — undefined behavior on LLP64 targets). */
    PML_UCX_VERBOSE(2, "Pack %s worker address, size %zu",
                    (modex_scope == OPAL_PMIX_LOCAL) ? "local" : "remote",
                    attrs.address_length);

    return OMPI_SUCCESS;
}
#endif
85118

86119
static int mca_pml_ucx_send_worker_address(void)
87120
{
88-
ucp_address_t *address;
89121
ucs_status_t status;
122+
123+
#if !HAVE_UCP_WORKER_ADDRESS_FLAGS
124+
ucp_address_t *address;
90125
size_t addrlen;
91126
int rc;
92127

@@ -96,16 +131,35 @@ static int mca_pml_ucx_send_worker_address(void)
96131
return OMPI_ERROR;
97132
}
98133

134+
PML_UCX_VERBOSE(2, "Pack worker address, size %ld", addrlen);
135+
99136
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
100137
&mca_pml_ucx_component.pmlm_version, (void*)address, addrlen);
138+
139+
ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
140+
101141
if (OMPI_SUCCESS != rc) {
102-
PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
103-
return OMPI_ERROR;
142+
goto err;
143+
}
144+
#else
145+
/* Pack just network device addresses for remote node peers */
146+
status = mca_pml_ucx_send_worker_address_type(UCP_WORKER_ADDRESS_FLAG_NET_ONLY,
147+
OPAL_PMIX_REMOTE);
148+
if (UCS_OK != status) {
149+
goto err;
104150
}
105151

106-
ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
152+
status = mca_pml_ucx_send_worker_address_type(0, OPAL_PMIX_LOCAL);
153+
if (UCS_OK != status) {
154+
goto err;
155+
}
156+
#endif
107157

108158
return OMPI_SUCCESS;
159+
160+
err:
161+
PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
162+
return OMPI_ERROR;
109163
}
110164

111165
static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
@@ -121,6 +175,9 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
121175
PML_UCX_ERROR("Failed to receive UCX worker address: %s (%d)",
122176
opal_strerror(ret), ret);
123177
}
178+
179+
PML_UCX_VERBOSE(2, "Got proc %d address, size %ld",
180+
proc->super.proc_name.vpid, *addrlen_p);
124181
return ret;
125182
}
126183

0 commit comments

Comments
 (0)