@@ -310,22 +310,41 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310
310
const void * arg0 , const void * arg1 , bool send_first , int mode ,
311
311
ompi_request_t * * req )
312
312
{
313
- pmix_info_t pinfo , * results = NULL ;
313
+ pmix_info_t * pinfo , * results = NULL ;
314
314
size_t nresults ;
315
- opal_process_name_t * name_array = NULL ;
316
- char * tag = NULL ;
317
- size_t proc_count ;
318
- size_t cid_base = 0 ;
315
+ opal_process_name_t * name_array , * rname_array , * tmp_name_array ;
319
316
bool cid_base_set = false;
317
+ char * tag = NULL ;
318
+ size_t proc_count , rproc_count , cid_base = 0UL , ninfo ;
320
319
int rc , leader_rank ;
321
- int ret = OMPI_SUCCESS ;
322
- pmix_proc_t * procs = NULL ;
320
+ pmix_proc_t * procs ;
321
+ void * grpinfo = NULL , * list = NULL ;
322
+ pmix_data_array_t darray ;
323
+ char tmp [PMIX_MAX_KEYLEN ];
323
324
324
325
rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
325
326
if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
326
327
return rc ;
327
328
}
328
329
330
+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
331
+ rc = ompi_group_to_proc_name_array (newcomm -> c_remote_group , & rname_array , & rproc_count );
332
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
333
+ free (name_array );
334
+ return rc ;
335
+ }
336
+ tmp_name_array = (opal_process_name_t * )realloc (name_array , (proc_count + rproc_count ) * sizeof (opal_process_name_t ));
337
+ if (NULL == tmp ) {
338
+ free (name_array );
339
+ free (rname_array );
340
+ return OMPI_ERR_OUT_OF_RESOURCE ;
341
+ }
342
+ name_array = tmp_name_array ;
343
+ memcpy (& name_array [proc_count ], rname_array , rproc_count * sizeof (opal_process_name_t ));
344
+ proc_count += rproc_count ;
345
+ free (rname_array );
346
+ }
347
+
329
348
switch (mode ) {
330
349
case OMPI_COMM_CID_GROUP_NEW :
331
350
tag = (char * ) arg0 ;
@@ -341,15 +360,58 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341
360
break ;
342
361
}
343
362
344
- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
363
+ grpinfo = PMIx_Info_list_start ();
364
+ if (NULL == grpinfo ) {
365
+ return OMPI_ERR_OUT_OF_RESOURCE ;
366
+ }
367
+
368
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
369
+ if (PMIX_SUCCESS != rc ) {
370
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
371
+ return OMPI_ERR_OUT_OF_RESOURCE ;
372
+ }
373
+
374
+ list = PMIx_Info_list_start ();
375
+
376
+ size_t c_index = (size_t )newcomm -> c_index ;
377
+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
378
+ if (PMIX_SUCCESS != rc ) {
379
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
380
+ return OMPI_ERR_OUT_OF_RESOURCE ;
381
+ }
382
+
383
+ rc = PMIx_Info_list_convert (list , & darray );
384
+ if (PMIX_SUCCESS != rc ) {
385
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
386
+ return OMPI_ERR_OUT_OF_RESOURCE ;
387
+ }
388
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
389
+ if (PMIX_SUCCESS != rc ) {
390
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
391
+ return OMPI_ERR_OUT_OF_RESOURCE ;
392
+ }
393
+ PMIx_Info_list_release (list );
394
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
395
+
396
+
397
+ rc = PMIx_Info_list_convert (grpinfo , & darray );
398
+ if (PMIX_SUCCESS != rc ) {
399
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
400
+ return OMPI_ERR_OUT_OF_RESOURCE ;
401
+ }
402
+
403
+ pinfo = (pmix_info_t * )darray .array ;
404
+ ninfo = darray .size ;
405
+ PMIx_Info_list_release (grpinfo );
345
406
346
407
PMIX_PROC_CREATE (procs , proc_count );
347
408
for (size_t i = 0 ; i < proc_count ; ++ i ) {
348
409
OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
349
410
}
350
411
351
- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
352
- PMIX_INFO_DESTRUCT (& pinfo );
412
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" , tag , proc_count , ninfo , cid_base ));
413
+ rc = PMIx_Group_construct (tag , procs , proc_count , pinfo , ninfo , & results , & nresults );
414
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
353
415
if (PMIX_SUCCESS != rc ) {
354
416
char msg_string [1024 ];
355
417
switch (rc ) {
@@ -361,7 +423,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361
423
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
362
424
msg_string );
363
425
364
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
426
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
365
427
break ;
366
428
case PMIX_ERR_NOT_SUPPORTED :
367
429
sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -370,10 +432,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370
432
true,
371
433
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
372
434
msg_string );
373
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
435
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
374
436
break ;
375
437
default :
376
- ret = opal_pmix_convert_status (rc );
438
+ rc = opal_pmix_convert_status (rc );
377
439
break ;
378
440
}
379
441
goto fn_exit ;
@@ -383,23 +445,27 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383
445
if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
384
446
PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
385
447
if (PMIX_SUCCESS != rc ) {
386
- ret = opal_pmix_convert_status (rc );
448
+ rc = opal_pmix_convert_status (rc );
387
449
goto fn_exit ;
388
450
}
389
451
cid_base_set = true;
390
452
break ;
391
453
}
392
454
}
393
455
456
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" , tag , proc_count , ninfo , cid_base ));
457
+
458
+ /* destruct the group */
394
459
rc = PMIx_Group_destruct (tag , NULL , 0 );
395
460
if (PMIX_SUCCESS != rc ) {
396
- ret = opal_pmix_convert_status (rc );
461
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
462
+ rc = opal_pmix_convert_status (rc );
397
463
goto fn_exit ;
398
464
}
399
465
400
466
if (!cid_base_set ) {
401
467
opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
402
- ret = OMPI_ERROR ;
468
+ rc = OMPI_ERROR ;
403
469
goto fn_exit ;
404
470
}
405
471
@@ -421,7 +487,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
421
487
name_array = NULL ;
422
488
}
423
489
424
- return ret ;
490
+ return rc ;
425
491
}
426
492
427
493
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -446,6 +512,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446
512
block = & comm -> c_contextidb ;
447
513
}
448
514
515
+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
516
+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
517
+ if (true == flag ) {
518
+ newcomm -> c_index = i ;
519
+ break ;
520
+ }
521
+ }
522
+ assert (newcomm -> c_index > 2 );
523
+
449
524
if (NULL == arg1 ) {
450
525
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451
526
!ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -464,18 +539,11 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
464
539
is_new_block = true;
465
540
}
466
541
542
+
467
543
if (block != & newcomm -> c_contextidb ) {
468
544
(void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
469
545
}
470
546
471
- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
472
- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
473
- if (true == flag ) {
474
- newcomm -> c_index = i ;
475
- break ;
476
- }
477
- }
478
-
479
547
newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
480
548
481
549
opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -498,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
498
566
499
567
/* old CID algorithm */
500
568
501
- /* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
569
+ /* if we got here and comm is NULL then that means the app is invoking MPI-4 Sessions or later
502
570
functions but the pml does not support these functions so return not supported */
503
571
if (NULL == comm ) {
504
572
char msg_string [1024 ];
@@ -963,6 +1031,64 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
963
1031
return rc ;
964
1032
}
965
1033
1034
+ int ompi_comm_get_remote_cid (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1035
+ {
1036
+ ompi_proc_t * ompi_proc ;
1037
+ pmix_proc_t pmix_proc ;
1038
+ pmix_info_t tinfo [2 ];
1039
+ pmix_value_t * val = NULL ;
1040
+ ompi_comm_extended_cid_t excid ;
1041
+ int rc = OMPI_SUCCESS ;
1042
+ size_t remote_cid64 ;
1043
+
1044
+ assert (NULL != remote_cid );
1045
+
1046
+ if (OMPI_COMM_IS_GLOBAL_INDEX (comm )) {
1047
+ * remote_cid = comm -> c_index ;
1048
+ } else {
1049
+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1050
+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1051
+
1052
+ PMIx_Info_construct (& tinfo [0 ]);
1053
+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1054
+
1055
+ excid = ompi_comm_get_extended_cid (comm );
1056
+
1057
+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1058
+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1059
+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1060
+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1061
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1062
+ }
1063
+
1064
+ if (NULL == val ) {
1065
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1066
+ rc = OMPI_ERR_NOT_FOUND ;
1067
+ goto done ;
1068
+ }
1069
+
1070
+ if (val -> type != PMIX_SIZE ) {
1071
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1072
+ rc = OMPI_ERR_TYPE_MISMATCH ;
1073
+ goto done ;
1074
+ }
1075
+
1076
+ if (PMIX_SUCCESS == rc ) {
1077
+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1078
+ rc = OMPI_SUCCESS ;
1079
+ * remote_cid = (uint32_t )remote_cid64 ;
1080
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1081
+ }
1082
+ }
1083
+
1084
+ done :
1085
+ if (NULL != val ) {
1086
+ PMIX_VALUE_RELEASE (val );
1087
+ }
1088
+
1089
+ return rc ;
1090
+ }
1091
+
966
1092
static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
967
1093
{
968
1094
ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments