
Commit 6b7f4a1

sessions: add support for ucx more
Greatly simplify support for MPI_Comm_create_from_group and MPI_Intercomm_create_from_groups by removing the need to support the 128-bit excid notion. Instead, make use of a PMIx capability, PMIX_GROUP_LOCAL_CID, together with the notion of PMIX_GROUP_INFO. Introduced in Open PMIx 4.1.3, this capability allows us to piggy-back the local cid selected for the new communicator on the PMIx_Group_construct operation. With this approach, many of the complex active-message-style operations implemented in the OB1 PML to support excids can be avoided.

This PR also includes simplifications to the OFI MTL to make use of the PMIX_GROUP_LOCAL_CID feature.

Infrastructure for debugging communicator management routines was also introduced, along with a new MCA parameter, mpi_comm_verbose.

Related to #12566

Signed-off-by: Howard Pritchard <[email protected]>
1 parent f31a9be commit 6b7f4a1
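As a rough illustration of the mechanism described above, the following is a minimal, hypothetical sketch (not code from this commit) of the PMIx pattern: each rank attaches the communicator index it chose locally to PMIx_Group_construct via a PMIX_GROUP_INFO directive carrying PMIX_GROUP_LOCAL_CID, and a peer's value can later be fetched on demand with PMIx_Get, qualified by the group's PMIX_GROUP_CONTEXT_ID. Error handling is mostly omitted; names such as publish_local_cid, lookup_peer_cid, tag, and my_local_cid are illustrative only.

#include <pmix.h>

/* Rank-local view: attach this rank's cid to the group construct. */
static pmix_status_t publish_local_cid(const char *tag, const pmix_proc_t *procs, size_t nprocs,
                                       size_t my_local_cid, size_t *cid_base)
{
    void *grpinfo = PMIx_Info_list_start();   /* directives for the construct */
    void *list = PMIx_Info_list_start();      /* per-rank group-info payload */
    pmix_data_array_t darray;
    pmix_info_t *pinfo, *results = NULL;
    size_t ninfo, nresults = 0;
    pmix_status_t rc;

    /* ask the PMIx server to assign a context id to the group */
    PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);

    /* piggy-back this rank's locally selected cid on the group exchange */
    PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &my_local_cid, PMIX_SIZE);
    PMIx_Info_list_convert(list, &darray);
    PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
    PMIX_DATA_ARRAY_DESTRUCT(&darray);

    PMIx_Info_list_convert(grpinfo, &darray);
    pinfo = (pmix_info_t *) darray.array;
    ninfo = darray.size;

    rc = PMIx_Group_construct(tag, procs, nprocs, pinfo, ninfo, &results, &nresults);
    PMIX_DATA_ARRAY_DESTRUCT(&darray);
    if (PMIX_SUCCESS == rc) {
        /* the assigned context id comes back in the results array */
        for (size_t i = 0; i < nresults; ++i) {
            if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
                PMIX_VALUE_GET_NUMBER(rc, &results[i].value, *cid_base, size_t);
                break;
            }
        }
        PMIX_INFO_FREE(results, nresults);
        /* the group only exists to carry the exchange */
        (void) PMIx_Group_destruct(tag, NULL, 0);
    }
    PMIx_Info_list_release(grpinfo);
    PMIx_Info_list_release(list);
    return rc;
}

/* Later, on first contact with a peer: fetch the cid that peer chose. */
static pmix_status_t lookup_peer_cid(const pmix_proc_t *peer, size_t cid_base, size_t *peer_cid)
{
    pmix_info_t qual;
    pmix_value_t *val = NULL;
    pmix_status_t rc;

    PMIX_INFO_CONSTRUCT(&qual);
    PMIX_INFO_LOAD(&qual, PMIX_GROUP_CONTEXT_ID, &cid_base, PMIX_SIZE);
    PMIX_INFO_SET_QUALIFIER(&qual);   /* qualify the lookup with the group's context id */
    rc = PMIx_Get(peer, PMIX_GROUP_LOCAL_CID, &qual, 1, &val);
    if (PMIX_SUCCESS == rc && NULL != val) {
        if (PMIX_SIZE == val->type) {
            PMIX_VALUE_GET_NUMBER(rc, val, *peer_cid, size_t);
        }
        PMIX_VALUE_RELEASE(val);
    }
    PMIX_INFO_DESTRUCT(&qual);
    return rc;
}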

File tree: 12 files changed (+293, -470 lines)


ompi/communicator/comm.c (3 additions, 10 deletions)

@@ -24,7 +24,7 @@
  * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
  * Copyright (c) 2021 Nanook Consulting. All rights reserved.
- * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
+ * Copyright (c) 2018-2024 Triad National Security, LLC. All rights
  *                          reserved.
  * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
@@ -1738,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
                                        ompi_communicator_t **newintercomm)
 {
     ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL;
-    ompi_comm_extended_cid_block_t new_block;
+    ompi_comm_extended_cid_block_t new_block = {0};
     bool i_am_leader = local_leader == local_group->grp_my_rank;
     ompi_proc_t **rprocs;
     uint64_t data[4];
@@ -1864,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
         return rc;
     }
 
-    /* will be using a communicator ID derived from the bridge communicator to save some time */
-    new_block.block_cid.cid_base = data[1];
-    new_block.block_cid.cid_sub.u64 = data[2];
-    new_block.block_nextsub = 0;
-    new_block.block_nexttag = 0;
-    new_block.block_level = (int8_t) data[3];
-
-    rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW);
+    rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
     if ( OMPI_SUCCESS != rc ) {
         OBJ_RELEASE(newcomp);
         return rc;

ompi/communicator/comm_cid.c (176 additions, 37 deletions)

@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                                         const void *arg0, const void *arg1, bool send_first, int mode,
                                         ompi_request_t **req)
 {
-    pmix_info_t pinfo, *results = NULL;
+    pmix_info_t *pinfo, *results = NULL;
     size_t nresults;
-    opal_process_name_t *name_array = NULL;
-    char *tag = NULL;
-    size_t proc_count;
-    size_t cid_base = 0;
+    opal_process_name_t opal_proc_name;
     bool cid_base_set = false;
+    char *tag = NULL;
+    size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo;
     int rc, leader_rank;
-    int ret = OMPI_SUCCESS;
-    pmix_proc_t *procs = NULL;
-
-    rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        return rc;
-    }
+    pmix_proc_t *procs;
+    void *grpinfo = NULL, *list = NULL;
+    pmix_data_array_t darray;
 
     switch (mode) {
     case OMPI_COMM_CID_GROUP_NEW:
@@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
         break;
     }
 
-    PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
+    grpinfo = PMIx_Info_list_start();
+    if (NULL == grpinfo) {
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    list = PMIx_Info_list_start();
+
+    size_t c_index = (size_t)newcomm->c_index;
+    rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_convert(list, &darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+    rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
+    PMIX_DATA_ARRAY_DESTRUCT(&darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_convert(grpinfo, &darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    pinfo = (pmix_info_t*)darray.array;
+    ninfo = darray.size;
+
+    proc_count = newcomm->c_local_group->grp_proc_count;
+    if ( OMPI_COMM_IS_INTER (newcomm) ){
+        rproc_count = newcomm->c_remote_group->grp_proc_count;
+    }
+
+    PMIX_PROC_CREATE(procs, proc_count + rproc_count);
 
-    PMIX_PROC_CREATE(procs, proc_count);
     for (size_t i = 0 ; i < proc_count; ++i) {
-        OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]);
+        opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i);
+        OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name);
+    }
+    for (size_t i = 0; i < rproc_count; ++i) {
+        opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i);
+        OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name);
     }
 
-    rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults);
-    PMIX_INFO_DESTRUCT(&pinfo);
+    tproc_count = proc_count + rproc_count;
+
+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
+                        tag, tproc_count, ninfo, cid_base));
+    rc = PMIx_Group_construct(tag, procs, tproc_count, pinfo, ninfo, &results, &nresults);
+    PMIX_DATA_ARRAY_DESTRUCT(&darray);
     if(PMIX_SUCCESS != rc) {
         char msg_string[1024];
         switch (rc) {
@@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                            "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
                            msg_string);
 
-            ret = MPI_ERR_UNSUPPORTED_OPERATION;
+            rc = MPI_ERR_UNSUPPORTED_OPERATION;
             break;
         case PMIX_ERR_NOT_SUPPORTED:
             sprintf(msg_string,"PMIx server does not support PMIx Group operations");
@@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                            true,
                            "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
                            msg_string);
-            ret = MPI_ERR_UNSUPPORTED_OPERATION;
+            rc = MPI_ERR_UNSUPPORTED_OPERATION;
            break;
        default:
-            ret = opal_pmix_convert_status(rc);
+            rc = opal_pmix_convert_status(rc);
            break;
        }
        goto fn_exit;
@@ -383,23 +438,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
         if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
             PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t);
             if(PMIX_SUCCESS != rc) {
-                ret = opal_pmix_convert_status(rc);
+                rc = opal_pmix_convert_status(rc);
                 goto fn_exit;
             }
             cid_base_set = true;
             break;
         }
     }
 
+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
+                        tag, tproc_count, ninfo, cid_base));
+
+    /* destruct the group */
     rc = PMIx_Group_destruct (tag, NULL, 0);
     if(PMIX_SUCCESS != rc) {
-        ret = opal_pmix_convert_status(rc);
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
+        rc = opal_pmix_convert_status(rc);
         goto fn_exit;
     }
 
     if (!cid_base_set) {
         opal_show_help("help-comm.txt", "cid-base-not-set", true);
-        ret = OMPI_ERROR;
+        rc = OMPI_ERROR;
         goto fn_exit;
     }
 
@@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
     }
 
     if(NULL != procs) {
-        PMIX_PROC_FREE(procs, proc_count);
+        PMIX_PROC_FREE(procs, tproc_count);
         procs = NULL;
     }
 
-    if(NULL != name_array) {
-        free (name_array);
-        name_array = NULL;
+    if (NULL != grpinfo) {
+        PMIx_Info_list_release(grpinfo);
     }
 
-    return ret;
+    if (NULL != list) {
+        PMIx_Info_list_release(list);
+    }
+
+    return rc;
 }
 
 static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
@@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
         block = &comm->c_contextidb;
     }
 
+    for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
+        bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
+        if (true == flag) {
+            newcomm->c_index = i;
+            break;
+        }
+    }
+    assert(newcomm->c_index > 2);
+
     if (NULL == arg1) {
         if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
             !ompi_comm_extended_cid_block_available (&comm->c_contextidb)) {
@@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
         (void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block);
     }
 
-    for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
-        bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
-        if (true == flag) {
-            newcomm->c_index = i;
-            break;
-        }
-    }
-
     newcomm->c_contextid = newcomm->c_contextidb.block_cid;
 
     opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid,
@@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
        functions but the pml does not support these functions so return not supported */
     if (NULL == comm) {
         char msg_string[1024];
-        sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", 
+        sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features",
                 mca_pml_base_selected_component.pmlm_version.mca_component_name);
         opal_show_help("help-comm.txt",
                        "MPI function not supported",
@@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
     ompi_comm_cid_context_t *context;
     ompi_comm_request_t *request;
     ompi_request_t *subreq;
+    uint32_t comm_size;
     int ret = 0;
 
     /* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
 
     request->context = &context->super;
 
+    /* Prep communicator for handling remote cids if needed */
+
+    if (!OMPI_COMM_IS_GLOBAL_INDEX(*newcomm)) {
+        if (OMPI_COMM_IS_INTER(*newcomm)) {
+            comm_size = ompi_comm_remote_size(*newcomm);
+        } else {
+            comm_size = ompi_comm_size(*newcomm);
+        }
+
+        (*newcomm)->c_index_vec = (uint32_t *)calloc(comm_size, sizeof(uint32_t));
+        if (NULL == (*newcomm)->c_index_vec) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        if (OMPI_COMM_IS_INTRA(*newcomm)) {
+            (*newcomm)->c_index_vec[(*newcomm)->c_my_rank] = (*newcomm)->c_index;
+        }
+    }
+
     if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) {
         /* Initialize the PML stuff in the newcomm */
         if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) {
@@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
     return rc;
 }
 
+int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid)
+{
+    ompi_proc_t *ompi_proc;
+    pmix_proc_t pmix_proc;
+    pmix_info_t tinfo[2];
+    pmix_value_t *val = NULL;
+    ompi_comm_extended_cid_t excid;
+    int rc = OMPI_SUCCESS;
+    size_t remote_cid64;
+
+    assert(NULL != remote_cid);
+
+    ompi_proc = ompi_comm_peer_lookup(comm, dest);
+    OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
+
+    PMIx_Info_construct(&tinfo[0]);
+    PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
+
+    excid = ompi_comm_get_extended_cid(comm);
+
+    PMIX_INFO_CONSTRUCT(&tinfo[1]);
+    PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE);
+    PMIX_INFO_SET_QUALIFIER(&tinfo[1]);
+    if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc)));
+        rc = OMPI_ERR_NOT_FOUND;
+        goto done;
+    }
+
+    if (NULL == val) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL"));
+        rc = OMPI_ERR_NOT_FOUND;
+        goto done;
+    }
+
+    if (val->type != PMIX_SIZE) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
+        rc = OMPI_ERR_TYPE_MISMATCH;
+        goto done;
+    }
+
+    PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t);
+    rc = OMPI_SUCCESS;
+    *remote_cid = (uint32_t)remote_cid64;
+    comm->c_index_vec[dest] = (uint32_t)remote_cid64;
+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base));
+
+done:
+    if (NULL != val) {
+        PMIX_VALUE_RELEASE(val);
+    }
+
+    return rc;
+}
+
 static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;

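The new ompi_comm_get_remote_cid_from_pmix above also caches the value it fetches in c_index_vec, so a transport only needs to consult PMIx on first contact with a given peer. A hypothetical caller (illustrative only, not part of this commit) might wrap it along these lines:

#include "ompi/communicator/communicator.h"   /* assumed OMPI-internal header */

/* Hypothetical helper (not from this commit): return the context id the peer
 * uses for this communicator, consulting PMIx only once per peer. */
static inline int get_peer_cid(ompi_communicator_t *comm, int dest, uint32_t *rcid)
{
    if (OMPI_COMM_IS_GLOBAL_INDEX(comm)) {
        /* conventional CID allocation: every rank shares the same index */
        *rcid = (uint32_t) comm->c_index;
        return OMPI_SUCCESS;
    }
    if (0 != comm->c_index_vec[dest]) {
        /* already fetched earlier (or our own entry on an intra-communicator) */
        *rcid = comm->c_index_vec[dest];
        return OMPI_SUCCESS;
    }
    /* first message to this peer: ask PMIx; the call caches into c_index_vec */
    return ompi_comm_get_remote_cid_from_pmix(comm, dest, rcid);
}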