PMIx_Fences - remove two during MPI initialization #11451

Open
wants to merge 1 commit into base: main

90 changes: 26 additions & 64 deletions ompi/instance/instance.c
@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
* Copyright (c) 2018-2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@@ -18,6 +18,14 @@
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/runtime/opal_params.h"
#include "opal/util/timings.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/smsc/base/base.h"
#include "opal/mca/mpool/base/mpool_base_tree.h"
#include "opal/mca/pmix/pmix-internal.h"
#include "opal/mca/pmix/base/base.h"

#include "ompi/mca/pml/pml.h"
#include "ompi/runtime/params.h"
@@ -33,26 +41,19 @@
#include "ompi/dpm/dpm.h"
#include "ompi/file/file.h"
#include "ompi/mpiext/mpiext.h"
#include "ompi/util/timings.h"

#include "ompi/mca/hook/base/base.h"
#include "ompi/mca/op/base/base.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/smsc/base/base.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/part/base/base.h"
#include "ompi/mca/io/base/base.h"
#include "ompi/mca/topo/base/base.h"
#include "opal/mca/pmix/base/base.h"

#include "opal/mca/mpool/base/mpool_base_tree.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/util/timings.h"
#include "opal/mca/pmix/pmix-internal.h"

ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}};

@@ -341,7 +342,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
pmix_info_t info[2];
pmix_status_t rc;
opal_pmix_lock_t mylock;
OMPI_TIMING_INIT(64);

OPAL_TIMING_ENV_INIT(init_common);

ret = ompi_mpi_instance_retain ();
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -382,13 +384,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
}

OMPI_TIMING_NEXT("initialization");
OPAL_TIMING_ENV_NEXT(init_common, "initialization");

/* Setup RTE */
if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) {
return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret);
}

OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init");

/* open the ompi hook framework */
for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) {
ret = mca_base_framework_open (ompi_framework_dependencies[i], 0);
@@ -401,10 +405,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
}
}

OMPI_TIMING_NEXT("rte_init");
OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup");
OMPI_TIMING_IMPORT_OPAL("rte_init");

ompi_rte_initialized = true;
/* if we are oversubscribed, then set yield_when_idle
* accordingly */
@@ -507,9 +507,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
}

OMPI_TIMING_IMPORT_OPAL("orte_init");
OMPI_TIMING_NEXT("rte_init-commit");

/* exchange connection info - this function may also act as a barrier
* if data exchange is required. The modex occurs solely across procs
* in our job. If a barrier is required, the "modex" function will
@@ -520,19 +517,20 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
return ret; /* TODO: need to fix this */
}

OMPI_TIMING_NEXT("commit");
OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit");

#if (OPAL_ENABLE_TIMING)
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret);
}
OMPI_TIMING_NEXT("pmix-barrier-1");
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1");
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret);
}
OMPI_TIMING_NEXT("pmix-barrier-2");
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2");
}
#endif

@@ -577,7 +575,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
}
}

OMPI_TIMING_NEXT("modex");
OPAL_TIMING_ENV_NEXT(init_common, "modex");

/* select buffered send allocator component to be used */
if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) {
@@ -625,14 +623,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
}

if (mca_pml_base_requires_world ()) {
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
* be the case. */
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
}
}

/* initialize file handles */
if (OMPI_SUCCESS != (ret = ompi_file_init ())) {
return ompi_instance_print_error ("ompi_file_init() failed", ret);
@@ -709,47 +699,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret);
}

/* Do we need to wait for a debugger? */
ompi_rte_wait_for_debugger();

/* Next timing measurement */
OMPI_TIMING_NEXT("modex-barrier");

if (!opal_process_info.is_singleton) {
/* if we executed the above fence in the background, then
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
} else if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
* it at a later point */
bool flag = false;
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
fence_release, (void*)&active))) {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
}
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

/* check for timing request - get stop time and report elapsed
time if so, then start the clock again */
OMPI_TIMING_NEXT("barrier");

#if OPAL_ENABLE_PROGRESS_THREADS == 0
/* Start setting up the event engine for MPI operations. Don't
block in the event library, so that communications don't take
forever between procs in the dynamic code. This will increase
CPU utilization for the remainder of MPI_INIT when we are
blocking on RTE-level events, but may greatly reduce non-TCP
latency. */
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
int old_event_flags = opal_progress_set_event_flag(0);
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
#endif

/* Undo OPAL calling opal_progress_event_users_increment() during
@@ -791,6 +749,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

if (background_fence) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}

return OMPI_SUCCESS;
}

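With the modex-barrier block removed, instance.c now waits on the background fence only once, at the end of ompi_mpi_instance_init_common(). For reference, a minimal standalone sketch of the non-blocking fence idiom involved is shown below. It is illustrative only: it uses the plain PMIx client API, and fence_release_cb plus the polling loop stand in for Open MPI's fence_release callback and OMPI_LAZY_WAIT_FOR_COMPLETION (which also drives opal_progress).

```c
#include <stdbool.h>
#include <unistd.h>
#include <pmix.h>

static volatile bool fence_active = false;

/* pmix_op_cbfunc_t: invoked by PMIx when the fence completes */
static void fence_release_cb(pmix_status_t status, void *cbdata)
{
    (void) status;
    volatile bool *active = (volatile bool *) cbdata;
    *active = false;
}

int main(void)
{
    pmix_proc_t myproc;
    pmix_info_t info;
    bool collect = true;   /* gather posted modex data during the fence */

    if (PMIX_SUCCESS != PMIx_Init(&myproc, NULL, 0)) {
        return 1;
    }

    PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL);

    fence_active = true;
    if (PMIX_SUCCESS != PMIx_Fence_nb(NULL, 0, &info, 1,
                                      fence_release_cb,
                                      (void *) &fence_active)) {
        PMIX_INFO_DESTRUCT(&info);
        PMIx_Finalize(NULL, 0);
        return 1;
    }

    /* ... other initialization work can overlap with the fence here ... */

    /* Open MPI spins in OMPI_LAZY_WAIT_FOR_COMPLETION, which also drives
     * opal_progress(); a plain polling loop stands in for that here. */
    while (fence_active) {
        usleep(100);
    }

    PMIX_INFO_DESTRUCT(&info);
    PMIx_Finalize(NULL, 0);
    return 0;
}
```
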
99 changes: 12 additions & 87 deletions ompi/runtime/ompi_mpi_init.c
@@ -27,7 +27,7 @@
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
* Copyright (c) 2021-2023 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -299,16 +299,16 @@ static void fence_release(pmix_status_t status, void *cbdata)
OPAL_POST_OBJECT(active);
}


int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
bool reinit_ok)
{
int ret;
char *error = NULL;
bool active = false;
#if OPAL_USING_INTERNAL_PMIX
char *evar;
#endif
volatile bool active;
bool background_fence = false;
pmix_info_t info[2];
pmix_status_t rc;
OMPI_TIMING_INIT(64);
@@ -392,69 +392,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
free(tmp);
}

#if (OPAL_ENABLE_TIMING)
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
ret = opal_pmix_convert_status(rc);
error = "timing: pmix-barrier-1 failed";
goto error;
}
OMPI_TIMING_NEXT("pmix-barrier-1");
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
ret = opal_pmix_convert_status(rc);
error = "timing: pmix-barrier-2 failed";
goto error;
}
OMPI_TIMING_NEXT("pmix-barrier-2");
}
#endif

if (!opal_process_info.is_singleton) {
if (opal_pmix_base_async_modex) {
/* if we are doing an async modex, but we are collecting all
* data, then execute the non-blocking modex in the background.
* All calls to modex_recv will be cached until the background
* modex completes. If collect_all_data is false, then we skip
* the fence completely and retrieve data on-demand from the
* source node.
*/
if (opal_pmix_collect_all_data) {
/* execute the fence_nb in the background to collect
* the data */
background_fence = true;
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
fence_release,
(void*)&active))) {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
}
}
} else {
/* we want to do the modex - we block at this point, but we must
* do so in a manner that allows us to call opal_progress so our
* event library can be cycled as we have tied PMIx to that
* event base */
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if( PMIX_SUCCESS != rc) {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence() failed";
goto error;
}
/* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

OMPI_TIMING_NEXT("modex");

MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

@@ -495,15 +432,13 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
OMPI_TIMING_NEXT("modex-barrier");

if (!opal_process_info.is_singleton) {
/* if we executed the above fence in the background, then
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
} else if (!ompi_async_mpi_init) {
if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
* it at a later point */
* it at a later point. Right now at least OB1 PML needs this
* PMIx_Fence to make sure state associated with adding MPI_COMM_WORLD
* to the PML has been set prior to a process sending to another
* using the OB1 PML. */
bool flag = false;
active = true;
OPAL_POST_OBJECT(&active);
@@ -517,22 +452,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

/* check for timing request - get stop time and report elapsed
time if so, then start the clock again */
OMPI_TIMING_NEXT("barrier");

#if OPAL_ENABLE_PROGRESS_THREADS == 0
/* Start setting up the event engine for MPI operations. Don't
block in the event library, so that communications don't take
forever between procs in the dynamic code. This will increase
CPU utilization for the remainder of MPI_INIT when we are
blocking on RTE-level events, but may greatly reduce non-TCP
latency. */
int old_event_flags = opal_progress_set_event_flag(0);
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
#endif

/* wire up the mpi interface, if requested. Do this after the
non-block switch for non-TCP performance. Do before the
polling change as anyone with a complex wire-up is going to be
@@ -592,6 +513,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
opal_atomic_wmb();
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);

OMPI_TIMING_IMPORT_OPAL("opal_init_util");
OMPI_TIMING_IMPORT_OPAL("opal_init");
OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common");

/* Finish last measurement, output results
* and clear timing structure */
OMPI_TIMING_NEXT("barrier-finish");
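By contrast, the one fence that ompi_mpi_init() keeps is a pure barrier: PMIX_COLLECT_DATA is set to false, so no modex data is gathered, and its only job is the OB1 requirement described in the new comment (every rank must have added MPI_COMM_WORLD to the PML before any peer sends). A rough sketch of that flavor of fence follows, again against the plain PMIx client API and shown in blocking form for brevity (the actual code uses PMIx_Fence_nb plus a progress-driving wait), with error handling trimmed.

```c
#include <stdbool.h>
#include <pmix.h>

/* Barrier-only fence: synchronize all procs in the namespace without
 * collecting any posted (modex) data.  Assumes PMIx_Init() has already
 * been called by this process. */
int barrier_without_modex(void)
{
    pmix_info_t info;
    pmix_status_t rc;
    bool collect = false;    /* no data exchange, synchronization only */

    PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL);
    rc = PMIx_Fence(NULL, 0, &info, 1);   /* NULL/0 == all procs in my nspace */
    PMIX_INFO_DESTRUCT(&info);

    return (PMIX_SUCCESS == rc) ? 0 : -1;
}
```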