Skip to content

Commit 0294303

Browse files
committed
ompi/dpm: make procs consistent before calling PMIx_Connect()
ompi_dpm_connect_accept() calls PMIx_Connect() to establish a connection, with "procs" as an argument. PMIx requires "procs" to be consistent across clients. When it is used to set up inter-communicator communication, ompi_dpm_connect_accept() does not maintain the order of the processes in "procs". This is because the function is called by both MPI_Comm_connect() and MPI_Comm_accept(), and it always puts the processes of the local communicator in "procs" first, followed by the processes of the remote communicator. However, for the callers of MPI_Comm_connect() and MPI_Comm_accept(), the local communicator and the remote communicator are different. This patch fixes the issue by sorting "procs" before it is used to call PMIx_Connect(); this ensures that "procs" is consistent across processes. Signed-off-by: Wei Zhang <[email protected]>
1 parent 698be2c commit 0294303

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

ompi/dpm/dpm.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,18 @@ int ompi_dpm_init(void)
9797
return OMPI_SUCCESS;
9898
}
9999

100+
static int compare_pmix_proc(const void *a, const void *b)
101+
{
102+
const pmix_proc_t *proc_a = (pmix_proc_t *)a;
103+
const pmix_proc_t *proc_b = (pmix_proc_t *)b;
104+
105+
int nspace_dif = strncmp(proc_a->nspace, proc_b->nspace, PMIX_MAX_NSLEN);
106+
if (nspace_dif != 0)
107+
return nspace_dif;
108+
109+
return proc_a->rank - proc_b->rank;
110+
}
111+
100112
int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
101113
const char *port_string, bool send_first,
102114
ompi_communicator_t **newcomm)
@@ -378,6 +390,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
378390
* so that add_procs will not result in a slew of lookups */
379391
PMIX_INFO_CONSTRUCT(&tinfo);
380392
PMIX_INFO_LOAD(&tinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
393+
394+
/*
395+
* sort procs so that all ranks call PMIx_Connect() with the processes in same order
396+
*/
397+
qsort(procs, nprocs, sizeof(pmix_proc_t), compare_pmix_proc);
381398
pret = PMIx_Connect(procs, nprocs, &tinfo, 1);
382399
PMIX_INFO_DESTRUCT(&tinfo);
383400
PMIX_PROC_FREE(procs, nprocs);

0 commit comments

Comments
 (0)