Skip to content

Commit 6993304

Browse files
committed
ofi/common: fix code that broke sessions
With the sessions initialization model (section 11.3 of the MPI 4 standard), MPI may be initialized and finalized any number of times. This patch refactors code that was assuming a one-shot init/finalize sequence for initializing Open MPI and its MCA param space. The underlying problem with the replaced code was that an app can call MPI_Session_finalize, and if there are no more instances active, the MCA param space is destroyed. So if one does not build Open MPI to use dynamically loaded frameworks, and the code uses static variables in a way that assumes the MCA param space is always preserved if a static variable is set to some value, then things break if a subsequent MPI_Session_init is invoked. Related to open-mpi#12869 Signed-off-by: Howard Pritchard <[email protected]>
1 parent 86961a2 commit 6993304

File tree

1 file changed

+22
-8
lines changed

1 file changed

+22
-8
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,11 @@ int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char
324324

325325
int opal_common_ofi_mca_register(const mca_base_component_t *component)
326326
{
327-
static int include_index = -1;
328-
static int exclude_index = -1;
329-
static int verbose_index = -1;
330-
static int accelerator_rank_index = -1;
327+
static int include_index;
328+
static int exclude_index;
329+
static int verbose_index;
330+
static int accelerator_rank_index;
331+
int param;
331332
int ret;
332333

333334
if (fi_version() < FI_VERSION(1, 0)) {
@@ -336,7 +337,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
336337

337338
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
338339

339-
if (0 > include_index) {
340+
param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include");
341+
if (0 > param) {
340342
/*
341343
* this monkey business is needed because of the way the MCA VARs stuff tries to handle
342344
* pointers to strings when when destructing the MCA var database. If you don't do
@@ -359,9 +361,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
359361
ret = include_index;
360362
goto err;
361363
}
364+
} else {
365+
include_index = param;
362366
}
363367

364-
if (0 > exclude_index) {
368+
param = mca_base_var_find("opal", "opal_common", "ofi", "provider_exclude");
369+
if (0 > param) {
365370
if (NULL == opal_common_ofi.prov_exclude) {
366371
opal_common_ofi.prov_exclude = (char **) malloc(sizeof(char *));
367372
assert(NULL != opal_common_ofi.prov_exclude);
@@ -378,9 +383,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
378383
ret = exclude_index;
379384
goto err;
380385
}
386+
} else {
387+
exclude_index = param;
381388
}
382389

383-
if (0 > verbose_index) {
390+
param = mca_base_var_find("opal", "opal_common", "ofi", "verbose");
391+
if (0 > param) {
384392
verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose",
385393
"Verbose level of the OFI components",
386394
MCA_BASE_VAR_TYPE_INT, NULL, 0,
@@ -391,9 +399,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
391399
ret = verbose_index;
392400
goto err;
393401
}
402+
} else {
403+
verbose_index = param;
394404
}
395405

396-
if (0 > accelerator_rank_index) {
406+
407+
param = mca_base_var_find("opal", "opal_common", "ofi", "accelerator_rank");
408+
if (0 > param) {
397409
accelerator_rank_index
398410
= mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank",
399411
"Process rank(non-negative) on the selected accelerator device",
@@ -404,6 +416,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
404416
ret = accelerator_rank_index;
405417
goto err;
406418
}
419+
} else {
420+
accelerator_rank_index = param;
407421
}
408422

409423
if (component) {

0 commit comments

Comments
 (0)