@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310
310
const void * arg0 , const void * arg1 , bool send_first , int mode ,
311
311
ompi_request_t * * req )
312
312
{
313
- pmix_info_t pinfo , * results = NULL ;
313
+ pmix_info_t * pinfo , * results = NULL ;
314
314
size_t nresults ;
315
- opal_process_name_t * name_array = NULL ;
316
- char * tag = NULL ;
317
- size_t proc_count ;
318
- size_t cid_base = 0 ;
315
+ opal_process_name_t opal_proc_name ;
319
316
bool cid_base_set = false;
317
+ char * tag = NULL ;
318
+ size_t proc_count = 0 , rproc_count = 0 , cid_base = 0UL , ninfo ;
320
319
int rc , leader_rank ;
321
- int ret = OMPI_SUCCESS ;
322
- pmix_proc_t * procs = NULL ;
323
-
324
- rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
325
- if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
326
- return rc ;
327
- }
320
+ pmix_proc_t * procs ;
321
+ void * grpinfo = NULL , * list = NULL ;
322
+ pmix_data_array_t darray ;
328
323
329
324
switch (mode ) {
330
325
case OMPI_COMM_CID_GROUP_NEW :
@@ -341,15 +336,71 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341
336
break ;
342
337
}
343
338
344
- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
339
+ grpinfo = PMIx_Info_list_start ();
340
+ if (NULL == grpinfo ) {
341
+ return OMPI_ERR_OUT_OF_RESOURCE ;
342
+ }
343
+
344
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
345
+ if (PMIX_SUCCESS != rc ) {
346
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
347
+ return OMPI_ERR_OUT_OF_RESOURCE ;
348
+ }
349
+
350
+ list = PMIx_Info_list_start ();
351
+
352
+ size_t c_index = (size_t )newcomm -> c_index ;
353
+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
354
+ if (PMIX_SUCCESS != rc ) {
355
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
356
+ return OMPI_ERR_OUT_OF_RESOURCE ;
357
+ }
358
+
359
+ rc = PMIx_Info_list_convert (list , & darray );
360
+ if (PMIX_SUCCESS != rc ) {
361
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
362
+ return OMPI_ERR_OUT_OF_RESOURCE ;
363
+ }
364
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
365
+ if (PMIX_SUCCESS != rc ) {
366
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
367
+ return OMPI_ERR_OUT_OF_RESOURCE ;
368
+ }
369
+ PMIx_Info_list_release (list );
370
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
371
+
372
+
373
+ rc = PMIx_Info_list_convert (grpinfo , & darray );
374
+ if (PMIX_SUCCESS != rc ) {
375
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
376
+ return OMPI_ERR_OUT_OF_RESOURCE ;
377
+ }
378
+
379
+ pinfo = (pmix_info_t * )darray .array ;
380
+ ninfo = darray .size ;
381
+ PMIx_Info_list_release (grpinfo );
382
+
383
+ proc_count = newcomm -> c_local_group -> grp_proc_count ;
384
+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
385
+ rproc_count = newcomm -> c_remote_group -> grp_proc_count ;
386
+ }
387
+
388
+ PMIX_PROC_CREATE (procs , proc_count + rproc_count );
345
389
346
- PMIX_PROC_CREATE (procs , proc_count );
347
390
for (size_t i = 0 ; i < proc_count ; ++ i ) {
348
- OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
391
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_local_group , i );
392
+ OPAL_PMIX_CONVERT_NAME (& procs [i ],& opal_proc_name );
393
+ }
394
+ for (size_t i = 0 ; i < rproc_count ; ++ i ) {
395
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_remote_group , i );
396
+ OPAL_PMIX_CONVERT_NAME (& procs [proc_count + i ],& opal_proc_name );
349
397
}
350
398
351
- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
352
- PMIX_INFO_DESTRUCT (& pinfo );
399
+
400
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
401
+ tag , proc_count + rproc_count , ninfo , cid_base ));
402
+ rc = PMIx_Group_construct (tag , procs , proc_count + rproc_count , pinfo , ninfo , & results , & nresults );
403
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
353
404
if (PMIX_SUCCESS != rc ) {
354
405
char msg_string [1024 ];
355
406
switch (rc ) {
@@ -361,7 +412,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361
412
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
362
413
msg_string );
363
414
364
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
415
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
365
416
break ;
366
417
case PMIX_ERR_NOT_SUPPORTED :
367
418
sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -370,10 +421,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370
421
true,
371
422
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
372
423
msg_string );
373
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
424
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
374
425
break ;
375
426
default :
376
- ret = opal_pmix_convert_status (rc );
427
+ rc = opal_pmix_convert_status (rc );
377
428
break ;
378
429
}
379
430
goto fn_exit ;
@@ -383,23 +434,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383
434
if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
384
435
PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
385
436
if (PMIX_SUCCESS != rc ) {
386
- ret = opal_pmix_convert_status (rc );
437
+ rc = opal_pmix_convert_status (rc );
387
438
goto fn_exit ;
388
439
}
389
440
cid_base_set = true;
390
441
break ;
391
442
}
392
443
}
393
444
445
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
446
+ tag , proc_count + rproc_count , ninfo , cid_base ));
447
+
448
+ /* destruct the group */
394
449
rc = PMIx_Group_destruct (tag , NULL , 0 );
395
450
if (PMIX_SUCCESS != rc ) {
396
- ret = opal_pmix_convert_status (rc );
451
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
452
+ rc = opal_pmix_convert_status (rc );
397
453
goto fn_exit ;
398
454
}
399
455
400
456
if (!cid_base_set ) {
401
457
opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
402
- ret = OMPI_ERROR ;
458
+ rc = OMPI_ERROR ;
403
459
goto fn_exit ;
404
460
}
405
461
@@ -416,12 +472,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
416
472
procs = NULL ;
417
473
}
418
474
419
- if (NULL != name_array ) {
420
- free (name_array );
421
- name_array = NULL ;
422
- }
423
-
424
- return ret ;
475
+ return rc ;
425
476
}
426
477
427
478
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -446,6 +497,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446
497
block = & comm -> c_contextidb ;
447
498
}
448
499
500
+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
501
+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
502
+ if (true == flag ) {
503
+ newcomm -> c_index = i ;
504
+ break ;
505
+ }
506
+ }
507
+ assert (newcomm -> c_index > 2 );
508
+
449
509
if (NULL == arg1 ) {
450
510
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451
511
!ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -464,18 +524,11 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
464
524
is_new_block = true;
465
525
}
466
526
527
+
467
528
if (block != & newcomm -> c_contextidb ) {
468
529
(void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
469
530
}
470
531
471
- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
472
- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
473
- if (true == flag ) {
474
- newcomm -> c_index = i ;
475
- break ;
476
- }
477
- }
478
-
479
532
newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
480
533
481
534
opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -498,7 +551,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
498
551
499
552
/* old CID algorithm */
500
553
501
- /* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
554
+ /* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
502
555
functions but the pml does not support these functions so return not supported */
503
556
if (NULL == comm ) {
504
557
char msg_string [1024 ];
@@ -963,6 +1016,64 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
963
1016
return rc ;
964
1017
}
965
1018
1019
+ int ompi_comm_get_remote_cid (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1020
+ {
1021
+ ompi_proc_t * ompi_proc ;
1022
+ pmix_proc_t pmix_proc ;
1023
+ pmix_info_t tinfo [2 ];
1024
+ pmix_value_t * val = NULL ;
1025
+ ompi_comm_extended_cid_t excid ;
1026
+ int rc = OMPI_SUCCESS ;
1027
+ size_t remote_cid64 ;
1028
+
1029
+ assert (NULL != remote_cid );
1030
+
1031
+ if (OMPI_COMM_IS_GLOBAL_INDEX (comm )) {
1032
+ * remote_cid = comm -> c_index ;
1033
+ } else {
1034
+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1035
+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1036
+
1037
+ PMIx_Info_construct (& tinfo [0 ]);
1038
+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1039
+
1040
+ excid = ompi_comm_get_extended_cid (comm );
1041
+
1042
+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1043
+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1044
+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1045
+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1046
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1047
+ }
1048
+
1049
+ if (NULL == val ) {
1050
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1051
+ rc = OMPI_ERR_NOT_FOUND ;
1052
+ goto done ;
1053
+ }
1054
+
1055
+ if (val -> type != PMIX_SIZE ) {
1056
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1057
+ rc = OMPI_ERR_TYPE_MISMATCH ;
1058
+ goto done ;
1059
+ }
1060
+
1061
+ if (PMIX_SUCCESS == rc ) {
1062
+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1063
+ rc = OMPI_SUCCESS ;
1064
+ * remote_cid = (uint32_t )remote_cid64 ;
1065
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1066
+ }
1067
+ }
1068
+
1069
+ done :
1070
+ if (NULL != val ) {
1071
+ PMIX_VALUE_RELEASE (val );
1072
+ }
1073
+
1074
+ return rc ;
1075
+ }
1076
+
966
1077
static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
967
1078
{
968
1079
ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments