Skip to content

Commit 23865b7

Browse files
committed
sessions: add support for ucx more
Greatly simplify support for MPI_Comm_create_from_group and MPI_Intercomm_create_from_group by removing the need to support the 128-bit excid notion. Instead, make use of a PMIx capability - PMIX_GROUP_LOCAL_CID and the notion of PMIX_GROUP_INFO. This capability was introduced in Open PMIx 4.1.3. This capability allows us to piggy-back a local cid selected for the new communicator on the PMIx_Group_construct operation. Using this approach, a lot of the complex active message style operations implemented in the OB1 PML to support excids can be avoided. This PR also includes simplifications to the OFI MTL to make use of the PMIX_GROUP_LOCAL_CID feature. Infrastructure for debugging communicator management routines was also introduced, along with a new MCA parameter - mpi_comm_verbose. Related to #12566 Signed-off-by: Howard Pritchard <[email protected]>
1 parent 20b900e commit 23865b7

13 files changed

+330
-382
lines changed

ompi/communicator/comm.c

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,9 +1509,6 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
15091509

15101510
static int ompi_comm_idup_with_info_finish (ompi_comm_request_t *request)
15111511
{
1512-
ompi_comm_idup_with_info_context_t *context =
1513-
(ompi_comm_idup_with_info_context_t *) request->context;
1514-
15151512
/* done */
15161513
return MPI_SUCCESS;
15171514
}
@@ -1741,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
17411738
ompi_communicator_t **newintercomm)
17421739
{
17431740
ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL;
1744-
ompi_comm_extended_cid_block_t new_block;
1741+
ompi_comm_extended_cid_block_t new_block = {{0}};
17451742
bool i_am_leader = local_leader == local_group->grp_my_rank;
17461743
ompi_proc_t **rprocs;
17471744
uint64_t data[4];
@@ -1867,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
18671864
return rc;
18681865
}
18691866

1870-
/* will be using a communicator ID derived from the bridge communicator to save some time */
1871-
new_block.block_cid.cid_base = data[1];
1872-
new_block.block_cid.cid_sub.u64 = data[2];
1873-
new_block.block_nextsub = 0;
1874-
new_block.block_nexttag = 0;
1875-
new_block.block_level = (int8_t) data[3];
1876-
1877-
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW);
1867+
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
18781868
if ( OMPI_SUCCESS != rc ) {
18791869
OBJ_RELEASE(newcomp);
18801870
return rc;

ompi/communicator/comm_cid.c

Lines changed: 152 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -310,22 +310,41 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310310
const void *arg0, const void *arg1, bool send_first, int mode,
311311
ompi_request_t **req)
312312
{
313-
pmix_info_t pinfo, *results = NULL;
313+
pmix_info_t *pinfo, *results = NULL;
314314
size_t nresults;
315-
opal_process_name_t *name_array = NULL;
316-
char *tag = NULL;
317-
size_t proc_count;
318-
size_t cid_base = 0;
315+
opal_process_name_t *name_array, *rname_array, *tmp_name_array;
319316
bool cid_base_set = false;
317+
char *tag = NULL;
318+
size_t proc_count, rproc_count, cid_base = 0UL, ninfo;
320319
int rc, leader_rank;
321-
int ret = OMPI_SUCCESS;
322-
pmix_proc_t *procs = NULL;
320+
pmix_proc_t *procs;
321+
void *grpinfo = NULL, *list = NULL;
322+
pmix_data_array_t darray;
323+
char tmp[PMIX_MAX_KEYLEN];
323324

324325
rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count);
325326
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
326327
return rc;
327328
}
328329

330+
if ( OMPI_COMM_IS_INTER (newcomm) ){
331+
rc = ompi_group_to_proc_name_array (newcomm->c_remote_group, &rname_array, &rproc_count);
332+
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
333+
free (name_array);
334+
return rc;
335+
}
336+
tmp_name_array = (opal_process_name_t *)realloc(name_array, (proc_count + rproc_count) * sizeof(opal_process_name_t));
337+
if (NULL == tmp) {
338+
free(name_array);
339+
free(rname_array);
340+
return OMPI_ERR_OUT_OF_RESOURCE ;
341+
}
342+
name_array = tmp_name_array;
343+
memcpy(&name_array[proc_count], rname_array, rproc_count * sizeof(opal_process_name_t));
344+
proc_count += rproc_count;
345+
free(rname_array);
346+
}
347+
329348
switch (mode) {
330349
case OMPI_COMM_CID_GROUP_NEW:
331350
tag = (char *) arg0;
@@ -341,15 +360,58 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341360
break;
342361
}
343362

344-
PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
363+
grpinfo = PMIx_Info_list_start();
364+
if (NULL == grpinfo) {
365+
return OMPI_ERR_OUT_OF_RESOURCE ;
366+
}
367+
368+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
369+
if (PMIX_SUCCESS != rc) {
370+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
371+
return OMPI_ERR_OUT_OF_RESOURCE ;
372+
}
373+
374+
list = PMIx_Info_list_start();
375+
376+
size_t c_index = (size_t)newcomm->c_index;
377+
rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE);
378+
if (PMIX_SUCCESS != rc) {
379+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
380+
return OMPI_ERR_OUT_OF_RESOURCE ;
381+
}
382+
383+
rc = PMIx_Info_list_convert(list, &darray);
384+
if (PMIX_SUCCESS != rc) {
385+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
386+
return OMPI_ERR_OUT_OF_RESOURCE ;
387+
}
388+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
389+
if (PMIX_SUCCESS != rc) {
390+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
391+
return OMPI_ERR_OUT_OF_RESOURCE ;
392+
}
393+
PMIx_Info_list_release(list);
394+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
395+
396+
397+
rc = PMIx_Info_list_convert(grpinfo, &darray);
398+
if (PMIX_SUCCESS != rc) {
399+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
400+
return OMPI_ERR_OUT_OF_RESOURCE ;
401+
}
402+
403+
pinfo = (pmix_info_t*)darray.array;
404+
ninfo = darray.size;
405+
PMIx_Info_list_release(grpinfo);
345406

346407
PMIX_PROC_CREATE(procs, proc_count);
347408
for (size_t i = 0 ; i < proc_count; ++i) {
348409
OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]);
349410
}
350411

351-
rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults);
352-
PMIX_INFO_DESTRUCT(&pinfo);
412+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", tag, proc_count, ninfo, cid_base));
413+
rc = PMIx_Group_construct(tag, procs, proc_count, pinfo, ninfo, &results, &nresults);
414+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
353415
if(PMIX_SUCCESS != rc) {
354416
char msg_string[1024];
355417
switch (rc) {
@@ -361,7 +423,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361423
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
362424
msg_string);
363425

364-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
426+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
365427
break;
366428
case PMIX_ERR_NOT_SUPPORTED:
367429
sprintf(msg_string,"PMIx server does not support PMIx Group operations");
@@ -370,10 +432,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370432
true,
371433
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
372434
msg_string);
373-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
435+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
374436
break;
375437
default:
376-
ret = opal_pmix_convert_status(rc);
438+
rc = opal_pmix_convert_status(rc);
377439
break;
378440
}
379441
goto fn_exit;
@@ -383,23 +445,27 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383445
if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
384446
PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t);
385447
if(PMIX_SUCCESS != rc) {
386-
ret = opal_pmix_convert_status(rc);
448+
rc = opal_pmix_convert_status(rc);
387449
goto fn_exit;
388450
}
389451
cid_base_set = true;
390452
break;
391453
}
392454
}
393455

456+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", tag, proc_count, ninfo, cid_base));
457+
458+
/* destruct the group */
394459
rc = PMIx_Group_destruct (tag, NULL, 0);
395460
if(PMIX_SUCCESS != rc) {
396-
ret = opal_pmix_convert_status(rc);
461+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
462+
rc = opal_pmix_convert_status(rc);
397463
goto fn_exit;
398464
}
399465

400466
if (!cid_base_set) {
401467
opal_show_help("help-comm.txt", "cid-base-not-set", true);
402-
ret = OMPI_ERROR;
468+
rc = OMPI_ERROR;
403469
goto fn_exit;
404470
}
405471

@@ -421,7 +487,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
421487
name_array = NULL;
422488
}
423489

424-
return ret;
490+
return rc;
425491
}
426492

427493
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
@@ -446,6 +512,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446512
block = &comm->c_contextidb;
447513
}
448514

515+
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
516+
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
517+
if (true == flag) {
518+
newcomm->c_index = i;
519+
break;
520+
}
521+
}
522+
assert(newcomm->c_index > 2);
523+
449524
if (NULL == arg1) {
450525
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451526
!ompi_comm_extended_cid_block_available (&comm->c_contextidb)) {
@@ -464,18 +539,11 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
464539
is_new_block = true;
465540
}
466541

542+
467543
if (block != &newcomm->c_contextidb) {
468544
(void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block);
469545
}
470546

471-
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
472-
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
473-
if (true == flag) {
474-
newcomm->c_index = i;
475-
break;
476-
}
477-
}
478-
479547
newcomm->c_contextid = newcomm->c_contextidb.block_cid;
480548

481549
opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid,
@@ -498,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
498566

499567
/* old CID algorighm */
500568

501-
/* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
569+
/* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
502570
functions but the pml does not support these functions so return not supported */
503571
if (NULL == comm) {
504572
char msg_string[1024];
@@ -963,6 +1031,64 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
9631031
return rc;
9641032
}
9651033

1034+
int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid)
1035+
{
1036+
ompi_proc_t *ompi_proc;
1037+
pmix_proc_t pmix_proc;
1038+
pmix_info_t tinfo[2];
1039+
pmix_value_t *val = NULL;
1040+
ompi_comm_extended_cid_t excid;
1041+
int rc = OMPI_SUCCESS;
1042+
size_t remote_cid64;
1043+
1044+
assert(NULL != remote_cid);
1045+
1046+
if (OMPI_COMM_IS_GLOBAL_INDEX(comm)) {
1047+
*remote_cid = comm->c_index;
1048+
} else {
1049+
ompi_proc = ompi_comm_peer_lookup(comm, dest);
1050+
OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
1051+
1052+
PMIx_Info_construct(&tinfo[0]);
1053+
PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
1054+
1055+
excid = ompi_comm_get_extended_cid (comm);
1056+
1057+
PMIX_INFO_CONSTRUCT(&tinfo[1]);
1058+
PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE);
1059+
PMIX_INFO_SET_QUALIFIER(&tinfo[1]);
1060+
if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) {
1061+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc)));
1062+
}
1063+
1064+
if (NULL == val) {
1065+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL"));
1066+
rc = OMPI_ERR_NOT_FOUND;
1067+
goto done;
1068+
}
1069+
1070+
if (val->type != PMIX_SIZE) {
1071+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
1072+
rc = OMPI_ERR_TYPE_MISMATCH;
1073+
goto done;
1074+
}
1075+
1076+
if (PMIX_SUCCESS == rc) {
1077+
PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t);
1078+
rc = OMPI_SUCCESS;
1079+
*remote_cid = (uint32_t)remote_cid64;
1080+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base));
1081+
}
1082+
}
1083+
1084+
done:
1085+
if (NULL != val) {
1086+
PMIX_VALUE_RELEASE(val);
1087+
}
1088+
1089+
return rc;
1090+
}
1091+
9661092
static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
9671093
{
9681094
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;

ompi/communicator/comm_init.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
2525
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
26-
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
26+
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
2727
* reserved.
2828
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
2929
* Copyright (c) 2023 NVIDIA Corporation. All rights reserved.
@@ -69,6 +69,8 @@ ompi_predefined_communicator_t ompi_mpi_comm_self = {{{{0}}}};
6969
ompi_predefined_communicator_t ompi_mpi_comm_null = {{{{0}}}};
7070
ompi_communicator_t *ompi_mpi_comm_parent = NULL;
7171

72+
int ompi_comm_output = -1;
73+
7274
static bool ompi_comm_intrinsic_init;
7375

7476
ompi_predefined_communicator_t *ompi_mpi_comm_world_addr =
@@ -97,6 +99,14 @@ static int ompi_comm_finalize (void);
9799
*/
98100
int ompi_comm_init(void)
99101
{
102+
103+
/* create output stream */
104+
105+
if (ompi_comm_output == -1) {
106+
ompi_comm_output = opal_output_open(NULL);
107+
opal_output_set_verbosity(ompi_comm_output, ompi_comm_verbose_level);
108+
}
109+
100110
/* Setup communicator array */
101111
OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t);
102112
if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 16,
@@ -392,6 +402,11 @@ static int ompi_comm_finalize (void)
392402
/* finalize communicator requests */
393403
ompi_comm_request_fini ();
394404

405+
/* close output stream */
406+
407+
opal_output_close(ompi_comm_output);
408+
ompi_comm_output = -1;
409+
395410
/* release a reference to the attributes subsys */
396411
return ompi_attr_put_ref();
397412
}

ompi/communicator/communicator.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ OMPI_DECLSPEC extern opal_hash_table_t ompi_comm_hash;
153153
OMPI_DECLSPEC extern opal_pointer_array_t ompi_mpi_communicators;
154154
OMPI_DECLSPEC extern opal_pointer_array_t ompi_comm_f_to_c_table;
155155

156+
OMPI_DECLSPEC extern int ompi_comm_output;
157+
156158
struct ompi_comm_extended_cid_t {
157159
uint64_t cid_base;
158160
union {
@@ -614,6 +616,13 @@ static inline struct ompi_proc_t* ompi_comm_peer_lookup (const ompi_communicator
614616
return ompi_group_peer_lookup(comm->c_remote_group,peer_id);
615617
}
616618

619+
static inline bool ompi_comm_instances_same(const ompi_communicator_t *comm1, const ompi_communicator_t *comm2)
620+
{
621+
return comm1->instance == comm2->instance;
622+
}
623+
624+
int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid);
625+
617626
#if OPAL_ENABLE_FT_MPI
618627
/*
619628
* Support for MPI_ANY_SOURCE point-to-point operations

0 commit comments

Comments
 (0)