Skip to content

Commit e7ef1be

Browse files
committed
sessions: add support for ucx more
Greatly simplify support for MPI_Comm_create_from_group and MPI_Intercomm_create_from_group by removing the need to support the 128-bit excid notion. Instead, make use of a PMIx capability - PMIX_GROUP_LOCAL_CID and the notion of PMIX_GROUP_INFO. This capability was introduced in Open PMIx 4.1.3. This capability allows us to piggy-back a local cid selected for the new communicator on the PMIx_Group_construct operation. Using this approach, a lot of the complex active message style operations implemented in the OB1 PML to support excids can be avoided. This PR also includes simplifications to the OFI MTL to make use of the PMIX_GROUP_LOCAL_CID feature. Infrastructure for debugging communicator management routines was also introduced, along with a new MCA parameter - mpi_comm_verbose. Related to #12566 Signed-off-by: Howard Pritchard <[email protected]>
1 parent 20b900e commit e7ef1be

13 files changed

+327
-394
lines changed

ompi/communicator/comm.c

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,9 +1509,6 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
15091509

15101510
static int ompi_comm_idup_with_info_finish (ompi_comm_request_t *request)
15111511
{
1512-
ompi_comm_idup_with_info_context_t *context =
1513-
(ompi_comm_idup_with_info_context_t *) request->context;
1514-
15151512
/* done */
15161513
return MPI_SUCCESS;
15171514
}
@@ -1741,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
17411738
ompi_communicator_t **newintercomm)
17421739
{
17431740
ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL;
1744-
ompi_comm_extended_cid_block_t new_block;
1741+
ompi_comm_extended_cid_block_t new_block = {{0}};
17451742
bool i_am_leader = local_leader == local_group->grp_my_rank;
17461743
ompi_proc_t **rprocs;
17471744
uint64_t data[4];
@@ -1867,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
18671864
return rc;
18681865
}
18691866

1870-
/* will be using a communicator ID derived from the bridge communicator to save some time */
1871-
new_block.block_cid.cid_base = data[1];
1872-
new_block.block_cid.cid_sub.u64 = data[2];
1873-
new_block.block_nextsub = 0;
1874-
new_block.block_nexttag = 0;
1875-
new_block.block_level = (int8_t) data[3];
1876-
1877-
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW);
1867+
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
18781868
if ( OMPI_SUCCESS != rc ) {
18791869
OBJ_RELEASE(newcomp);
18801870
return rc;

ompi/communicator/comm_cid.c

Lines changed: 149 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310310
const void *arg0, const void *arg1, bool send_first, int mode,
311311
ompi_request_t **req)
312312
{
313-
pmix_info_t pinfo, *results = NULL;
313+
pmix_info_t *pinfo, *results = NULL;
314314
size_t nresults;
315-
opal_process_name_t *name_array = NULL;
316-
char *tag = NULL;
317-
size_t proc_count;
318-
size_t cid_base = 0;
315+
opal_process_name_t opal_proc_name;
319316
bool cid_base_set = false;
317+
char *tag = NULL;
318+
size_t proc_count = 0, rproc_count = 0, cid_base = 0UL, ninfo;
320319
int rc, leader_rank;
321-
int ret = OMPI_SUCCESS;
322-
pmix_proc_t *procs = NULL;
323-
324-
rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count);
325-
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
326-
return rc;
327-
}
320+
pmix_proc_t *procs;
321+
void *grpinfo = NULL, *list = NULL;
322+
pmix_data_array_t darray;
328323

329324
switch (mode) {
330325
case OMPI_COMM_CID_GROUP_NEW:
@@ -341,15 +336,71 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341336
break;
342337
}
343338

344-
PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
339+
grpinfo = PMIx_Info_list_start();
340+
if (NULL == grpinfo) {
341+
return OMPI_ERR_OUT_OF_RESOURCE ;
342+
}
343+
344+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
345+
if (PMIX_SUCCESS != rc) {
346+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
347+
return OMPI_ERR_OUT_OF_RESOURCE ;
348+
}
349+
350+
list = PMIx_Info_list_start();
351+
352+
size_t c_index = (size_t)newcomm->c_index;
353+
rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE);
354+
if (PMIX_SUCCESS != rc) {
355+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
356+
return OMPI_ERR_OUT_OF_RESOURCE ;
357+
}
358+
359+
rc = PMIx_Info_list_convert(list, &darray);
360+
if (PMIX_SUCCESS != rc) {
361+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
362+
return OMPI_ERR_OUT_OF_RESOURCE ;
363+
}
364+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
365+
if (PMIX_SUCCESS != rc) {
366+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
367+
return OMPI_ERR_OUT_OF_RESOURCE ;
368+
}
369+
PMIx_Info_list_release(list);
370+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
371+
372+
373+
rc = PMIx_Info_list_convert(grpinfo, &darray);
374+
if (PMIX_SUCCESS != rc) {
375+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
376+
return OMPI_ERR_OUT_OF_RESOURCE ;
377+
}
378+
379+
pinfo = (pmix_info_t*)darray.array;
380+
ninfo = darray.size;
381+
PMIx_Info_list_release(grpinfo);
382+
383+
proc_count = newcomm->c_local_group->grp_proc_count;
384+
if ( OMPI_COMM_IS_INTER (newcomm) ){
385+
rproc_count = newcomm->c_remote_group->grp_proc_count;
386+
}
387+
388+
PMIX_PROC_CREATE(procs, proc_count + rproc_count);
345389

346-
PMIX_PROC_CREATE(procs, proc_count);
347390
for (size_t i = 0 ; i < proc_count; ++i) {
348-
OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]);
391+
opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i);
392+
OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name);
393+
}
394+
for (size_t i = 0; i < rproc_count; ++i) {
395+
opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i);
396+
OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name);
349397
}
350398

351-
rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults);
352-
PMIX_INFO_DESTRUCT(&pinfo);
399+
400+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
401+
tag, proc_count + rproc_count, ninfo, cid_base));
402+
rc = PMIx_Group_construct(tag, procs, proc_count + rproc_count, pinfo, ninfo, &results, &nresults);
403+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
353404
if(PMIX_SUCCESS != rc) {
354405
char msg_string[1024];
355406
switch (rc) {
@@ -361,7 +412,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361412
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
362413
msg_string);
363414

364-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
415+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
365416
break;
366417
case PMIX_ERR_NOT_SUPPORTED:
367418
sprintf(msg_string,"PMIx server does not support PMIx Group operations");
@@ -370,10 +421,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370421
true,
371422
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
372423
msg_string);
373-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
424+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
374425
break;
375426
default:
376-
ret = opal_pmix_convert_status(rc);
427+
rc = opal_pmix_convert_status(rc);
377428
break;
378429
}
379430
goto fn_exit;
@@ -383,23 +434,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383434
if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
384435
PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t);
385436
if(PMIX_SUCCESS != rc) {
386-
ret = opal_pmix_convert_status(rc);
437+
rc = opal_pmix_convert_status(rc);
387438
goto fn_exit;
388439
}
389440
cid_base_set = true;
390441
break;
391442
}
392443
}
393444

445+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
446+
tag, proc_count + rproc_count, ninfo, cid_base));
447+
448+
/* destruct the group */
394449
rc = PMIx_Group_destruct (tag, NULL, 0);
395450
if(PMIX_SUCCESS != rc) {
396-
ret = opal_pmix_convert_status(rc);
451+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
452+
rc = opal_pmix_convert_status(rc);
397453
goto fn_exit;
398454
}
399455

400456
if (!cid_base_set) {
401457
opal_show_help("help-comm.txt", "cid-base-not-set", true);
402-
ret = OMPI_ERROR;
458+
rc = OMPI_ERROR;
403459
goto fn_exit;
404460
}
405461

@@ -416,12 +472,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
416472
procs = NULL;
417473
}
418474

419-
if(NULL != name_array) {
420-
free (name_array);
421-
name_array = NULL;
422-
}
423-
424-
return ret;
475+
return rc;
425476
}
426477

427478
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
@@ -446,6 +497,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446497
block = &comm->c_contextidb;
447498
}
448499

500+
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
501+
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
502+
if (true == flag) {
503+
newcomm->c_index = i;
504+
break;
505+
}
506+
}
507+
assert(newcomm->c_index > 2);
508+
449509
if (NULL == arg1) {
450510
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451511
!ompi_comm_extended_cid_block_available (&comm->c_contextidb)) {
@@ -464,18 +524,11 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
464524
is_new_block = true;
465525
}
466526

527+
467528
if (block != &newcomm->c_contextidb) {
468529
(void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block);
469530
}
470531

471-
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
472-
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
473-
if (true == flag) {
474-
newcomm->c_index = i;
475-
break;
476-
}
477-
}
478-
479532
newcomm->c_contextid = newcomm->c_contextidb.block_cid;
480533

481534
opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid,
@@ -498,7 +551,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
498551

499552
/* old CID algorighm */
500553

501-
/* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
554+
/* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
502555
functions but the pml does not support these functions so return not supported */
503556
if (NULL == comm) {
504557
char msg_string[1024];
@@ -963,6 +1016,64 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
9631016
return rc;
9641017
}
9651018

1019+
int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid)
1020+
{
1021+
ompi_proc_t *ompi_proc;
1022+
pmix_proc_t pmix_proc;
1023+
pmix_info_t tinfo[2];
1024+
pmix_value_t *val = NULL;
1025+
ompi_comm_extended_cid_t excid;
1026+
int rc = OMPI_SUCCESS;
1027+
size_t remote_cid64;
1028+
1029+
assert(NULL != remote_cid);
1030+
1031+
if (OMPI_COMM_IS_GLOBAL_INDEX(comm)) {
1032+
*remote_cid = comm->c_index;
1033+
} else {
1034+
ompi_proc = ompi_comm_peer_lookup(comm, dest);
1035+
OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
1036+
1037+
PMIx_Info_construct(&tinfo[0]);
1038+
PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
1039+
1040+
excid = ompi_comm_get_extended_cid (comm);
1041+
1042+
PMIX_INFO_CONSTRUCT(&tinfo[1]);
1043+
PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE);
1044+
PMIX_INFO_SET_QUALIFIER(&tinfo[1]);
1045+
if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) {
1046+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc)));
1047+
}
1048+
1049+
if (NULL == val) {
1050+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL"));
1051+
rc = OMPI_ERR_NOT_FOUND;
1052+
goto done;
1053+
}
1054+
1055+
if (val->type != PMIX_SIZE) {
1056+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
1057+
rc = OMPI_ERR_TYPE_MISMATCH;
1058+
goto done;
1059+
}
1060+
1061+
if (PMIX_SUCCESS == rc) {
1062+
PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t);
1063+
rc = OMPI_SUCCESS;
1064+
*remote_cid = (uint32_t)remote_cid64;
1065+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base));
1066+
}
1067+
}
1068+
1069+
done:
1070+
if (NULL != val) {
1071+
PMIX_VALUE_RELEASE(val);
1072+
}
1073+
1074+
return rc;
1075+
}
1076+
9661077
static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
9671078
{
9681079
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;

ompi/communicator/comm_init.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
2525
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
26-
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
26+
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
2727
* reserved.
2828
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
2929
* Copyright (c) 2023 NVIDIA Corporation. All rights reserved.
@@ -69,6 +69,8 @@ ompi_predefined_communicator_t ompi_mpi_comm_self = {{{{0}}}};
6969
ompi_predefined_communicator_t ompi_mpi_comm_null = {{{{0}}}};
7070
ompi_communicator_t *ompi_mpi_comm_parent = NULL;
7171

72+
int ompi_comm_output = -1;
73+
7274
static bool ompi_comm_intrinsic_init;
7375

7476
ompi_predefined_communicator_t *ompi_mpi_comm_world_addr =
@@ -97,6 +99,14 @@ static int ompi_comm_finalize (void);
9799
*/
98100
int ompi_comm_init(void)
99101
{
102+
103+
/* create output stream */
104+
105+
if (ompi_comm_output == -1) {
106+
ompi_comm_output = opal_output_open(NULL);
107+
opal_output_set_verbosity(ompi_comm_output, ompi_comm_verbose_level);
108+
}
109+
100110
/* Setup communicator array */
101111
OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t);
102112
if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 16,
@@ -392,6 +402,11 @@ static int ompi_comm_finalize (void)
392402
/* finalize communicator requests */
393403
ompi_comm_request_fini ();
394404

405+
/* close output stream */
406+
407+
opal_output_close(ompi_comm_output);
408+
ompi_comm_output = -1;
409+
395410
/* release a reference to the attributes subsys */
396411
return ompi_attr_put_ref();
397412
}

ompi/communicator/communicator.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ OMPI_DECLSPEC extern opal_hash_table_t ompi_comm_hash;
153153
OMPI_DECLSPEC extern opal_pointer_array_t ompi_mpi_communicators;
154154
OMPI_DECLSPEC extern opal_pointer_array_t ompi_comm_f_to_c_table;
155155

156+
OMPI_DECLSPEC extern int ompi_comm_output;
157+
156158
struct ompi_comm_extended_cid_t {
157159
uint64_t cid_base;
158160
union {
@@ -614,6 +616,13 @@ static inline struct ompi_proc_t* ompi_comm_peer_lookup (const ompi_communicator
614616
return ompi_group_peer_lookup(comm->c_remote_group,peer_id);
615617
}
616618

619+
static inline bool ompi_comm_instances_same(const ompi_communicator_t *comm1, const ompi_communicator_t *comm2)
620+
{
621+
return comm1->instance == comm2->instance;
622+
}
623+
624+
int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid);
625+
617626
#if OPAL_ENABLE_FT_MPI
618627
/*
619628
* Support for MPI_ANY_SOURCE point-to-point operations

0 commit comments

Comments
 (0)