From 683d1f686d31c2094b49c7e0d12ae5817ce90f84 Mon Sep 17 00:00:00 2001
From: Wei Zhang
Date: Mon, 11 Jul 2022 20:25:07 +0000
Subject: [PATCH] ompi/dpm: make procs consistent before calling PMIx_Connect()

ompi_dpm_connect_accept() calls PMIx_Connect() to establish a
connection, with "procs" as an argument.

PMIx requires "procs" to be consistent across clients.

When it is used to set up inter-communicator communication,
ompi_dpm_connect_accept() does not maintain the order of the
processes in "procs". This is because the function is called
by both MPI_Comm_connect() and MPI_Comm_accept(), and it
always puts the processes of the local communicator in "procs"
first, followed by the processes of the remote communicator.
However, for the callers of MPI_Comm_connect() and
MPI_Comm_accept(), the local communicator and the remote
communicator are different.

This patch fixes the issue by sorting "procs" before it is
used to call PMIx_Connect(). This ensures that "procs" is
consistent across processes.

Note: unlike the upstream commit, compare_pmix_proc() compares
ranks with relational operators instead of subtracting them,
because the difference of two unsigned pmix_rank_t values does
not reliably convert to an int of the correct sign.

Signed-off-by: Wei Zhang
(cherry picked from commit 029430375a0b46daf1500061b552395bb293754c)
---
 ompi/dpm/dpm.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c
index d45f37b8596..c7948936c24 100644
--- a/ompi/dpm/dpm.c
+++ b/ompi/dpm/dpm.c
@@ -97,6 +97,26 @@ int ompi_dpm_init(void)
     return OMPI_SUCCESS;
 }
 
+/*
+ * qsort() comparator for pmix_proc_t: order by namespace, then by rank.
+ *
+ * The rank comparison must not be written as a subtraction: pmix_rank_t
+ * is an unsigned type, so the difference wraps and its conversion to int
+ * can yield the wrong sign, giving qsort() an inconsistent ordering.
+ */
+static int compare_pmix_proc(const void *a, const void *b)
+{
+    const pmix_proc_t *proc_a = (const pmix_proc_t *)a;
+    const pmix_proc_t *proc_b = (const pmix_proc_t *)b;
+
+    int nspace_dif = strncmp(proc_a->nspace, proc_b->nspace, PMIX_MAX_NSLEN);
+    if (nspace_dif != 0) {
+        return nspace_dif;
+    }
+
+    return (proc_a->rank > proc_b->rank) - (proc_a->rank < proc_b->rank);
+}
+
 int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
                             const char *port_string, bool send_first,
                             ompi_communicator_t **newcomm)
@@ -378,6 +398,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
      * so that add_procs will not result in a slew of lookups */
     PMIX_INFO_CONSTRUCT(&tinfo);
     PMIX_INFO_LOAD(&tinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
+
+    /*
+     * sort procs so that all ranks call PMIx_Connect() with the processes in the same order
+     */
+    qsort(procs, nprocs, sizeof(pmix_proc_t), compare_pmix_proc);
     pret = PMIx_Connect(procs, nprocs, &tinfo, 1);
     PMIX_INFO_DESTRUCT(&tinfo);
     PMIX_PROC_FREE(procs, nprocs);