@@ -1204,71 +1204,84 @@ static int ompi_instance_group_self (ompi_instance_t *instance, ompi_group_t **g
1204
1204
1205
1205
/**
 * Build a group containing the members of a PMIx process set.
 *
 * Issues a PMIX_QUERY_PSET_MEMBERSHIP query qualified by PMIX_PSET_NAME.
 * If the first attempt reports PMIX_ERR_NOT_FOUND, the query is repeated
 * exactly once with the PMIX_QUERY_REFRESH_CACHE qualifier added.
 *
 * @param[in]  instance   instance stored in the new group's grp_instance
 * @param[in]  pset_name  name of the process set to look up
 * @param[out] group_out  receives the new group; written only on success
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE if the group
 *         cannot be allocated, OMPI_ERR_NOT_FOUND if the query returned no
 *         usable membership list, otherwise the converted PMIx/OPAL status.
 */
static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char *pset_name, ompi_group_t **group_out)
{
    int ret = OMPI_SUCCESS;
    size_t i, n;
    bool isnew, try_again = false, refresh = true;
    pmix_status_t rc;
    ompi_group_t *group = NULL;
    pmix_query_t query;
    pmix_info_t *info = NULL;
    size_t ninfo = 0;
    opal_process_name_t pname;

    PMIX_QUERY_CONSTRUCT(&query);
    PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
    PMIX_INFO_CREATE(query.qualifiers, 1);
    /* nqual must match the qualifier array, otherwise the qualifiers are
     * neither transmitted with the query nor destructed with it */
    query.nqual = 1;
    PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);

    /*
     * First try finding in the local PMIx cache, if not found, try a refresh
     */
fn_try_again:
    rc = PMIx_Query_info(&query, 1, &info, &ninfo);
    if (PMIX_SUCCESS != rc || 0 == ninfo) {
        if ((PMIX_ERR_NOT_FOUND == rc) && !try_again) {
            try_again = true;
            /* rebuild the query with the cache-refresh qualifier added */
            PMIX_QUERY_DESTRUCT(&query);
            PMIX_QUERY_CONSTRUCT(&query);
            PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
            PMIX_INFO_CREATE(query.qualifiers, 2);
            query.nqual = 2;
            PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);
            PMIX_INFO_LOAD(&query.qualifiers[1], PMIX_QUERY_REFRESH_CACHE, &refresh, PMIX_BOOL);
            goto fn_try_again;
        }
        /* a "successful" query with zero results must not be reported as
         * success: *group_out would be left unset */
        ret = (PMIX_SUCCESS == rc) ? OMPI_ERR_NOT_FOUND : opal_pmix_convert_status(rc);
        ompi_instance_print_error ("PMIx_Query_info() failed", ret);
        goto fn_w_info;
    }

    for (n = 0; n < ninfo; n++) {
        pmix_data_array_t *data_array;
        pmix_proc_t *members_array;

        if (0 != strcmp(info[n].key, PMIX_QUERY_PSET_MEMBERSHIP)) {
            continue;
        }

        /* the membership must arrive as a data array; anything else is
         * treated as "no membership list" and reported below */
        if (PMIX_DATA_ARRAY != info[n].value.type || NULL == info[n].value.data.darray) {
            break;
        }
        data_array = info[n].value.data.darray;
        members_array = (pmix_proc_t *) data_array->array;

        group = ompi_group_allocate (NULL, data_array->size);
        if (OPAL_UNLIKELY(NULL == group)) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto fn_w_info;
        }

        for (i = 0; i < data_array->size; i++) {
            /* the conversion macro reports its status through ret */
            OPAL_PMIX_CONVERT_PROCT(ret, &pname, &members_array[i]);
            if (OPAL_SUCCESS != ret) {
                ompi_instance_print_error("OPAL_PMIX_CONVERT_PROCT failed %d", ret);
                ompi_group_free(&group);
                goto fn_w_info;
            }
            group->grp_proc_pointers[i] = ompi_proc_find_and_add(&pname, &isnew);
        }
        break;
    }

    if (NULL != group) {
        ompi_set_group_rank (group, ompi_proc_local ());
        group->grp_instance = instance;
        *group_out = group;
    } else {
        /* NOTE(review): the other call sites pass an int status as the second
         * argument; here a string is passed -- confirm against the helper's
         * signature */
        ompi_instance_print_error("PMIx_Query_info did not return membership list for pset %s", pset_name);
        ret = OMPI_ERR_NOT_FOUND;
    }

fn_w_info:
    /* info is a heap array of ninfo entries owned by the caller; release
     * every entry and the array itself (no-op when info is NULL) */
    PMIX_INFO_FREE(info, ninfo);
    PMIX_QUERY_DESTRUCT(&query);

    return ret;
}
1273
1286
1274
1287
static int ompi_instance_get_pmix_pset_size (ompi_instance_t * instance , const char * pset_name , size_t * size_out )
0 commit comments