@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310
310
const void * arg0 , const void * arg1 , bool send_first , int mode ,
311
311
ompi_request_t * * req )
312
312
{
313
- pmix_info_t pinfo , * results = NULL ;
313
+ pmix_info_t * pinfo , * results = NULL ;
314
314
size_t nresults ;
315
- opal_process_name_t * name_array = NULL ;
316
- char * tag = NULL ;
317
- size_t proc_count ;
318
- size_t cid_base = 0 ;
315
+ opal_process_name_t opal_proc_name ;
319
316
bool cid_base_set = false;
317
+ char * tag = NULL ;
318
+ size_t proc_count = 0 , rproc_count = 0 , tproc_count = 0 , cid_base = 0UL , ninfo ;
320
319
int rc , leader_rank ;
321
- int ret = OMPI_SUCCESS ;
322
- pmix_proc_t * procs = NULL ;
323
-
324
- rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
325
- if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
326
- return rc ;
327
- }
320
+ pmix_proc_t * procs ;
321
+ void * grpinfo = NULL , * list = NULL ;
322
+ pmix_data_array_t darray ;
328
323
329
324
switch (mode ) {
330
325
case OMPI_COMM_CID_GROUP_NEW :
@@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341
336
break ;
342
337
}
343
338
344
- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
339
+ grpinfo = PMIx_Info_list_start ();
340
+ if (NULL == grpinfo ) {
341
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
342
+ goto fn_exit ;
343
+ }
344
+
345
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
346
+ if (PMIX_SUCCESS != rc ) {
347
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
348
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
349
+ goto fn_exit ;
350
+ }
351
+
352
+ list = PMIx_Info_list_start ();
353
+
354
+ size_t c_index = (size_t )newcomm -> c_index ;
355
+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
356
+ if (PMIX_SUCCESS != rc ) {
357
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
358
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
359
+ goto fn_exit ;
360
+ }
361
+
362
+ rc = PMIx_Info_list_convert (list , & darray );
363
+ if (PMIX_SUCCESS != rc ) {
364
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
365
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
366
+ goto fn_exit ;
367
+ }
368
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
369
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
370
+ if (PMIX_SUCCESS != rc ) {
371
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
372
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
373
+ goto fn_exit ;
374
+ }
375
+
376
+ rc = PMIx_Info_list_convert (grpinfo , & darray );
377
+ if (PMIX_SUCCESS != rc ) {
378
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
379
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
380
+ goto fn_exit ;
381
+ }
382
+
383
+ pinfo = (pmix_info_t * )darray .array ;
384
+ ninfo = darray .size ;
385
+
386
+ proc_count = newcomm -> c_local_group -> grp_proc_count ;
387
+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
388
+ rproc_count = newcomm -> c_remote_group -> grp_proc_count ;
389
+ }
390
+
391
+ PMIX_PROC_CREATE (procs , proc_count + rproc_count );
345
392
346
- PMIX_PROC_CREATE (procs , proc_count );
347
393
for (size_t i = 0 ; i < proc_count ; ++ i ) {
348
- OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
394
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_local_group , i );
395
+ OPAL_PMIX_CONVERT_NAME (& procs [i ],& opal_proc_name );
396
+ }
397
+ for (size_t i = 0 ; i < rproc_count ; ++ i ) {
398
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_remote_group , i );
399
+ OPAL_PMIX_CONVERT_NAME (& procs [proc_count + i ],& opal_proc_name );
349
400
}
350
401
351
- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
352
- PMIX_INFO_DESTRUCT (& pinfo );
402
+ tproc_count = proc_count + rproc_count ;
403
+
404
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
405
+ tag , tproc_count , ninfo , cid_base ));
406
+ rc = PMIx_Group_construct (tag , procs , tproc_count , pinfo , ninfo , & results , & nresults );
407
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
353
408
if (PMIX_SUCCESS != rc ) {
354
409
char msg_string [1024 ];
355
410
switch (rc ) {
@@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361
416
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
362
417
msg_string );
363
418
364
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
419
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
365
420
break ;
366
421
case PMIX_ERR_NOT_SUPPORTED :
367
422
sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370
425
true,
371
426
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
372
427
msg_string );
373
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
428
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
374
429
break ;
375
430
default :
376
- ret = opal_pmix_convert_status (rc );
431
+ rc = opal_pmix_convert_status (rc );
377
432
break ;
378
433
}
379
434
goto fn_exit ;
@@ -383,23 +438,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383
438
if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
384
439
PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
385
440
if (PMIX_SUCCESS != rc ) {
386
- ret = opal_pmix_convert_status (rc );
441
+ rc = opal_pmix_convert_status (rc );
387
442
goto fn_exit ;
388
443
}
389
444
cid_base_set = true;
390
445
break ;
391
446
}
392
447
}
393
448
449
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
450
+ tag , tproc_count , ninfo , cid_base ));
451
+
452
+ /* destruct the group */
394
453
rc = PMIx_Group_destruct (tag , NULL , 0 );
395
454
if (PMIX_SUCCESS != rc ) {
396
- ret = opal_pmix_convert_status (rc );
455
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
456
+ rc = opal_pmix_convert_status (rc );
397
457
goto fn_exit ;
398
458
}
399
459
400
460
if (!cid_base_set ) {
401
461
opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
402
- ret = OMPI_ERROR ;
462
+ rc = OMPI_ERROR ;
403
463
goto fn_exit ;
404
464
}
405
465
@@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
412
472
}
413
473
414
474
if (NULL != procs ) {
415
- PMIX_PROC_FREE (procs , proc_count );
475
+ PMIX_PROC_FREE (procs , tproc_count );
416
476
procs = NULL ;
417
477
}
418
478
419
- if (NULL != name_array ) {
420
- free (name_array );
421
- name_array = NULL ;
479
+ if (NULL != grpinfo ) {
480
+ PMIx_Info_list_release (grpinfo );
422
481
}
423
482
424
- return ret ;
483
+ if (NULL != list ) {
484
+ PMIx_Info_list_release (list );
485
+ }
486
+
487
+ return rc ;
425
488
}
426
489
427
490
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446
509
block = & comm -> c_contextidb ;
447
510
}
448
511
512
+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
513
+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
514
+ if (true == flag ) {
515
+ newcomm -> c_index = i ;
516
+ break ;
517
+ }
518
+ }
519
+ assert (newcomm -> c_index > 2 );
520
+
449
521
if (NULL == arg1 ) {
450
522
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451
523
!ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
468
540
(void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
469
541
}
470
542
471
- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
472
- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
473
- if (true == flag ) {
474
- newcomm -> c_index = i ;
475
- break ;
476
- }
477
- }
478
-
479
543
newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
480
544
481
545
opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
502
566
functions but the pml does not support these functions so return not supported */
503
567
if (NULL == comm ) {
504
568
char msg_string [1024 ];
505
- sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
569
+ sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
506
570
mca_pml_base_selected_component .pmlm_version .mca_component_name );
507
571
opal_show_help ("help-comm.txt" ,
508
572
"MPI function not supported" ,
@@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
886
950
ompi_comm_cid_context_t * context ;
887
951
ompi_comm_request_t * request ;
888
952
ompi_request_t * subreq ;
953
+ uint32_t comm_size ;
889
954
int ret = 0 ;
890
955
891
956
/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
907
972
908
973
request -> context = & context -> super ;
909
974
975
+ /* Prep communicator for handling remote cids if needed */
976
+
977
+ if (!OMPI_COMM_IS_GLOBAL_INDEX (* newcomm )) {
978
+ if (OMPI_COMM_IS_INTER (* newcomm )) {
979
+ comm_size = ompi_comm_remote_size (* newcomm );
980
+ } else {
981
+ comm_size = ompi_comm_size (* newcomm );
982
+ }
983
+
984
+ (* newcomm )-> c_index_vec = (uint32_t * )calloc (comm_size , sizeof (uint32_t ));
985
+ if (NULL == (* newcomm )-> c_index_vec ) {
986
+ return OMPI_ERR_OUT_OF_RESOURCE ;
987
+ }
988
+
989
+ if (OMPI_COMM_IS_INTRA (* newcomm )) {
990
+ (* newcomm )-> c_index_vec [(* newcomm )-> c_my_rank ] = (* newcomm )-> c_index ;
991
+ }
992
+ }
993
+
910
994
if (MPI_UNDEFINED != (* newcomm )-> c_local_group -> grp_my_rank ) {
911
995
/* Initialize the PML stuff in the newcomm */
912
996
if ( OMPI_SUCCESS != (ret = MCA_PML_CALL (add_comm (* newcomm ))) ) {
@@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
963
1047
return rc ;
964
1048
}
965
1049
1050
+ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1051
+ {
1052
+ ompi_proc_t * ompi_proc ;
1053
+ pmix_proc_t pmix_proc ;
1054
+ pmix_info_t tinfo [2 ];
1055
+ pmix_value_t * val = NULL ;
1056
+ ompi_comm_extended_cid_t excid ;
1057
+ int rc = OMPI_SUCCESS ;
1058
+ size_t remote_cid64 ;
1059
+
1060
+ assert (NULL != remote_cid );
1061
+
1062
+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1063
+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1064
+
1065
+ PMIx_Info_construct (& tinfo [0 ]);
1066
+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1067
+
1068
+ excid = ompi_comm_get_extended_cid (comm );
1069
+
1070
+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1071
+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1072
+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1073
+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1074
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1075
+ rc = OMPI_ERR_NOT_FOUND ;
1076
+ goto done ;
1077
+ }
1078
+
1079
+ if (NULL == val ) {
1080
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1081
+ rc = OMPI_ERR_NOT_FOUND ;
1082
+ goto done ;
1083
+ }
1084
+
1085
+ if (val -> type != PMIX_SIZE ) {
1086
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1087
+ rc = OMPI_ERR_TYPE_MISMATCH ;
1088
+ goto done ;
1089
+ }
1090
+
1091
+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1092
+ rc = OMPI_SUCCESS ;
1093
+ * remote_cid = (uint32_t )remote_cid64 ;
1094
+ comm -> c_index_vec [dest ] = (uint32_t )remote_cid64 ;
1095
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1096
+
1097
+ done :
1098
+ if (NULL != val ) {
1099
+ PMIX_VALUE_RELEASE (val );
1100
+ }
1101
+
1102
+ return rc ;
1103
+ }
1104
+
966
1105
static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
967
1106
{
968
1107
ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments