From 1c514948f64e532bbf6734d7ce020ed7f20fe9e8 Mon Sep 17 00:00:00 2001
From: Mikhail Brinskii
Date: Wed, 13 Feb 2019 16:23:09 +0200
Subject: [PATCH] PML/UCX: Use net worker address for remote peers

For remote node peers, pack a smaller worker address which contains
network device addresses only. This reduces the amount of OOB traffic
during startup.

Signed-off-by: Mikhail Brinskii
(cherry picked from commit 751d88192d05edb7e1912bab4e48643c6f9e1574)
---
 config/ompi_check_ucx.m4   |  4 +++
 ompi/mca/pml/ucx/pml_ucx.c | 65 +++++++++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4
index 8a8a6d469c3..668b0ff1478 100644
--- a/config/ompi_check_ucx.m4
+++ b/config/ompi_check_ucx.m4
@@ -120,6 +120,10 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                            UCP_ATOMIC_FETCH_OP_FXOR],
                           [], [],
                           [#include <ucp/api/ucp.h>])
+           AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
+                          [AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1],
+                                     [have worker address attribute])], [],
+                          [#include <ucp/api/ucp.h>])
            CPPFLAGS=$old_CPPFLAGS
 
            OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])])
diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c
index bcb689e1de8..00a95644c22 100644
--- a/ompi/mca/pml/ucx/pml_ucx.c
+++ b/ompi/mca/pml/ucx/pml_ucx.c
@@ -82,11 +82,46 @@ mca_pml_ucx_module_t ompi_pml_ucx = {
 #define PML_UCX_REQ_ALLOCA() \
     ((char *)alloca(ompi_pml_ucx.request_size) + ompi_pml_ucx.request_size);
 
+#if HAVE_UCP_WORKER_ADDRESS_FLAGS
+static int mca_pml_ucx_send_worker_address_type(int addr_flags, int modex_scope)
+{
+    ucs_status_t status;
+    ucp_worker_attr_t attrs;
+    int rc;
+
+    attrs.field_mask    = UCP_WORKER_ATTR_FIELD_ADDRESS |
+                          UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
+    attrs.address_flags = addr_flags;
+
+    status = ucp_worker_query(ompi_pml_ucx.ucp_worker, &attrs);
+    if (UCS_OK != status) {
+        PML_UCX_ERROR("Failed to query UCP worker address");
+        return OMPI_ERROR;
+    }
+
+    OPAL_MODEX_SEND(rc, modex_scope, &mca_pml_ucx_component.pmlm_version,
+                    (void*)attrs.address, attrs.address_length);
+
+    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, attrs.address);
+
+    if (OMPI_SUCCESS != rc) {
+        return OMPI_ERROR;
+    }
+
+    PML_UCX_VERBOSE(2, "Pack %s worker address, size %ld",
+                    (modex_scope == OPAL_PMIX_LOCAL) ? "local" : "remote",
+                    attrs.address_length);
+
+    return OMPI_SUCCESS;
+}
+#endif
+
 static int mca_pml_ucx_send_worker_address(void)
 {
-    ucp_address_t *address;
     ucs_status_t status;
+
+#if !HAVE_UCP_WORKER_ADDRESS_FLAGS
+    ucp_address_t *address;
     size_t addrlen;
     int rc;
 
@@ -96,16 +131,35 @@ static int mca_pml_ucx_send_worker_address(void)
         return OMPI_ERROR;
     }
 
+    PML_UCX_VERBOSE(2, "Pack worker address, size %ld", addrlen);
+
     OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
                     &mca_pml_ucx_component.pmlm_version,
                     (void*)address, addrlen);
+
+    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
+
     if (OMPI_SUCCESS != rc) {
-        PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
-        return OMPI_ERROR;
+        goto err;
+    }
+#else
+    /* Pack just network device addresses for remote node peers */
+    status = mca_pml_ucx_send_worker_address_type(UCP_WORKER_ADDRESS_FLAG_NET_ONLY,
+                                                  OPAL_PMIX_REMOTE);
+    if (UCS_OK != status) {
+        goto err;
     }
 
-    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
+    status = mca_pml_ucx_send_worker_address_type(0, OPAL_PMIX_LOCAL);
+    if (UCS_OK != status) {
+        goto err;
+    }
+#endif
 
     return OMPI_SUCCESS;
+
+err:
+    PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
+    return OMPI_ERROR;
 }
 
 static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
@@ -121,6 +175,9 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
         PML_UCX_ERROR("Failed to receive UCX worker address: %s (%d)",
                       opal_strerror(ret), ret);
     }
+
+    PML_UCX_VERBOSE(2, "Got proc %d address, size %ld",
+                    proc->super.proc_name.vpid, *addrlen_p);
     return ret;
 }