@@ -1091,7 +1091,8 @@ int ompi_instance_get_num_psets (ompi_instance_t *instance, int *npset_names)
1091
1091
1092
1092
int ompi_instance_get_nth_pset (ompi_instance_t * instance , int n , int * len , char * pset_name )
1093
1093
{
1094
- if (NULL == ompi_mpi_instance_pmix_psets && n >= ompi_instance_builtin_count ) {
1094
+ if (NULL == ompi_mpi_instance_pmix_psets ||
1095
+ (size_t ) n >= (ompi_instance_builtin_count + ompi_mpi_instance_num_pmix_psets )) {
1095
1096
ompi_instance_refresh_pmix_psets (PMIX_QUERY_PSET_NAMES );
1096
1097
}
1097
1098
@@ -1229,71 +1230,83 @@ static int ompi_instance_group_self (ompi_instance_t *instance, ompi_group_t **g
1229
1230
1230
1231
/*
 * Build a group containing the members of the PMIx process set pset_name.
 *
 * The membership is obtained with a PMIX_QUERY_PSET_MEMBERSHIP query that is
 * qualified by the process set name. If the first query reports
 * PMIX_ERR_NOT_FOUND, the query is issued once more with
 * PMIX_QUERY_REFRESH_CACHE set.
 *
 * @param[in]  instance   instance recorded in the new group's grp_instance
 * @param[in]  pset_name  name of the process set to look up
 * @param[out] group_out  receives the new group; written only on success
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_NOT_FOUND if the query produced
 *         no membership entry, OMPI_ERR_OUT_OF_RESOURCE on allocation
 *         failure, or the converted PMIx/OPAL status otherwise.
 */
static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char *pset_name, ompi_group_t **group_out)
{
    int ret = OMPI_SUCCESS;
    size_t i, n;
    bool isnew, try_again = false, refresh = true;
    pmix_status_t rc;
    ompi_group_t *group = NULL;
    pmix_query_t query;
    pmix_info_t *info = NULL;
    size_t ninfo = 0;
    opal_process_name_t pname;

    PMIX_QUERY_CONSTRUCT(&query);
    PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
    PMIX_INFO_CREATE(query.qualifiers, 1);
    if (NULL == query.qualifiers) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto fn_cleanup;
    }
    query.nqual = 1;
    PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);

    /*
     * First try finding in the local PMIx cache, if not found, try a refresh
     */
fn_try_again:
    rc = PMIx_Query_info (&query, 1, &info, &ninfo);
    if (PMIX_SUCCESS != rc || 0 == ninfo) {
        if ((PMIX_ERR_NOT_FOUND == rc) && (false == try_again)) {
            try_again = true;

            /* drop anything a failed query may have handed back before retrying */
            if (NULL != info) {
                PMIX_INFO_FREE(info, ninfo);
            }
            info = NULL;
            ninfo = 0;

            /* rebuild the query, this time asking for a cache refresh */
            PMIX_QUERY_DESTRUCT(&query);
            PMIX_QUERY_CONSTRUCT(&query);
            PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
            PMIX_INFO_CREATE(query.qualifiers, 2);
            if (NULL == query.qualifiers) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto fn_cleanup;
            }
            /* nqual must match the qualifier array or the refresh request is ignored */
            query.nqual = 2;
            PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);
            PMIX_INFO_LOAD(&query.qualifiers[1], PMIX_QUERY_REFRESH_CACHE, &refresh, PMIX_BOOL);
            goto fn_try_again;
        }

        /* a "successful" query without any result must not be reported as success */
        ret = (PMIX_SUCCESS == rc) ? OMPI_ERR_NOT_FOUND : opal_pmix_convert_status (rc);
        ompi_instance_print_error ("PMIx_Query_info() failed", ret);
        goto fn_cleanup;
    }

    for (n = 0; n < ninfo; n++) {
        pmix_data_array_t *data_array;
        pmix_proc_t *members_array;

        if (0 != strcmp (info[n].key, PMIX_QUERY_PSET_MEMBERSHIP)) {
            continue;
        }

        /* the membership is expected as a data array of pmix_proc_t entries */
        if (PMIX_DATA_ARRAY != info[n].value.type || NULL == info[n].value.data.darray) {
            continue;
        }

        data_array = info[n].value.data.darray;
        members_array = (pmix_proc_t *) data_array->array;

        group = ompi_group_allocate (NULL, data_array->size);
        if (OPAL_UNLIKELY(NULL == group)) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto fn_cleanup;
        }

        for (i = 0; i < data_array->size; i++) {
            /* the conversion status is stored in ret, not rc */
            OPAL_PMIX_CONVERT_PROCT(ret, &pname, &members_array[i]);
            if (OPAL_SUCCESS != ret) {
                /* NOTE(review): the "%d" below looks like a leftover format
                 * specifier; confirm whether ompi_instance_print_error formats it */
                ompi_instance_print_error ("OPAL_PMIX_CONVERT_PROCT failed %d", ret);
                ompi_group_free (&group);
                goto fn_cleanup;
            }
            group->grp_proc_pointers[i] = ompi_proc_find_and_add (&pname, &isnew);
        }

        /* only the first membership entry is used */
        break;
    }

    if (NULL != group) {
        ompi_set_group_rank (group, ompi_proc_local ());
        group->grp_instance = instance;
        *group_out = group;
    } else {
        ret = OMPI_ERR_NOT_FOUND;
    }

fn_cleanup:
    /* the result array belongs to the caller: release the whole array, not just element 0 */
    if (NULL != info) {
        PMIX_INFO_FREE(info, ninfo);
    }
    PMIX_QUERY_DESTRUCT(&query);

    return ret;
}
1298
1311
1299
1312
static int ompi_instance_get_pmix_pset_size (ompi_instance_t * instance , const char * pset_name , size_t * size_out )
0 commit comments