From c4adbb966c89421dde3d0c4354ab0d6922b330e2 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Wed, 14 Dec 2022 14:01:31 -0700 Subject: [PATCH] remove redundant PMIx fence calls at initialization related to issue #11166 Turns out making these changes revealed some issues with the way MPI_Session_init is implemented. Unless async modex and disabling of collective info is turned on, the path through MPI initialization for Sessions isn't really local. The code in ompi_mpi_instance_init_common will eventually need to be refactored to move synchronizing elements up into ompi_mpi_init for the world process model and into some delayed-till- comm creation routine(s) when using the sessions process model. also, remove a second wait on a debugger. We should only do this once. At some point a solution for attaching a debugger to an application which only uses the sessions process model will need to implemented. Given the semantics of sessions, its not clear when is the right point to synchronize processes with a debugger. Signed-off-by: Howard Pritchard --- ompi/instance/instance.c | 48 +++++-------------------- ompi/instance/instance.h | 9 +++-- ompi/mpi/c/session_init.c | 2 +- ompi/runtime/ompi_mpi_init.c | 68 +++--------------------------------- 4 files changed, 20 insertions(+), 107 deletions(-) diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 2d669ce8c4e..6c4b2c2dd20 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -331,13 +331,12 @@ static void evhandler_dereg_callbk(pmix_status_t status, /** * @brief Function that starts up the common components needed by all instances */ -static int ompi_mpi_instance_init_common (int argc, char **argv) +static int ompi_mpi_instance_init_common (int argc, char **argv, bool *background_fence) { int ret; ompi_proc_t **procs; size_t nprocs; volatile bool active; - bool background_fence = false; pmix_info_t info[2]; pmix_status_t rc; opal_pmix_lock_t mylock; @@ -402,7 +401,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } OMPI_TIMING_NEXT("rte_init"); - OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup"); OMPI_TIMING_IMPORT_OPAL("rte_init"); ompi_rte_initialized = true; @@ -520,7 +518,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ret; /* TODO: need to fix this */ } - OMPI_TIMING_NEXT("commit"); + OMPI_TIMING_NEXT("commit"); #if (OPAL_ENABLE_TIMING) if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && opal_pmix_collect_all_data && !opal_process_info.is_singleton) { @@ -548,7 +546,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) if (opal_pmix_collect_all_data) { /* execute the fence_nb in the background to collect * the data */ - background_fence = true; + if (NULL != background_fence) { + *background_fence = true; + } active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); @@ -709,39 +709,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret); } - /* Do we need to wait for a debugger? */ - ompi_rte_wait_for_debugger(); - - /* Next timing measurement */ - OMPI_TIMING_NEXT("modex-barrier"); - - if (!opal_process_info.is_singleton) { - /* if we executed the above fence in the background, then - * we have to wait here for it to complete. However, there - * is no reason to do two barriers! */ - if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } else if (!ompi_async_mpi_init) { - /* wait for everyone to reach this point - this is a hard - * barrier requirement at this time, though we hope to relax - * it at a later point */ - bool flag = false; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); - } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - /* check for timing request - get stop time and report elapsed - time if so, then start the clock again */ - OMPI_TIMING_NEXT("barrier"); - #if OPAL_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take @@ -794,7 +761,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return OMPI_SUCCESS; } -int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t *errhandler, ompi_instance_t **instance, int argc, char **argv) +int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t *errhandler, ompi_instance_t **instance, int argc, char **argv, + bool *background_fence) { ompi_instance_t *new_instance; int ret; @@ -809,7 +777,7 @@ int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t opal_mutex_lock (&instance_lock); if (0 == opal_atomic_fetch_add_32 (&ompi_instance_count, 1)) { - ret = ompi_mpi_instance_init_common (argc, argv); + ret = ompi_mpi_instance_init_common (argc, argv, background_fence); if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { opal_mutex_unlock (&instance_lock); return ret; diff --git a/ompi/instance/instance.h b/ompi/instance/instance.h index f72f19525ae..a9f4ff19c8b 100644 --- a/ompi/instance/instance.h +++ b/ompi/instance/instance.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2018-2022 Triad National Security, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -122,9 +122,14 @@ void ompi_mpi_instance_release (void); * @param[in] ts_level thread support level (see mpi.h) * @param[in] info info object * @param[in] errhander errhandler to set on the instance + * @param[in] argc argument count to application + * @param[in] argv arrray of arguments to application + * @param[out] background_fence set to true if pmix non-blocking fence initiated + * @param[out] instance handler to instance */ OMPI_DECLSPEC int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t *errhandler, - ompi_instance_t **instance, int argc, char **argv); + ompi_instance_t **instance, int argc, char **argv, + bool *background_fence); /** * @brief Destroy an MPI instance and set it to MPI_SESSION_NULL diff --git a/ompi/mpi/c/session_init.c b/ompi/mpi/c/session_init.c index 74c6e6f2cc3..16b08de32f1 100644 --- a/ompi/mpi/c/session_init.c +++ b/ompi/mpi/c/session_init.c @@ -54,7 +54,7 @@ int MPI_Session_init (MPI_Info info, MPI_Errhandler errhandler, MPI_Session *ses } } - rc = ompi_mpi_instance_init (ts_level, &info->super, errhandler, session, 0, NULL); + rc = ompi_mpi_instance_init (ts_level, &info->super, errhandler, session, 0, NULL, NULL); /* if an error occurred raise it on the null session */ OMPI_ERRHANDLER_RETURN (rc, MPI_SESSION_NULL, rc, FUNC_NAME); } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 19c0999d163..fbe93f27236 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -1,3 +1,4 @@ + /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana @@ -299,6 +300,7 @@ static void fence_release(pmix_status_t status, void *cbdata) OPAL_POST_OBJECT(active); } + int ompi_mpi_init(int argc, char **argv, int requested, int *provided, bool reinit_ok) { @@ -356,7 +358,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, ompi_mpi_thread_level(requested, provided); - ret = ompi_mpi_instance_init (*provided, &ompi_mpi_info_null.info.super, MPI_ERRORS_ARE_FATAL, &ompi_mpi_instance_default, argc, argv); + ret = ompi_mpi_instance_init (*provided, &ompi_mpi_info_null.info.super, MPI_ERRORS_ARE_FATAL, &ompi_mpi_instance_default, argc, argv, + &background_fence); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { error = "ompi_mpi_init: ompi_mpi_instance_init failed"; goto error; @@ -392,69 +395,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, free(tmp); } -#if (OPAL_ENABLE_TIMING) - if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && - opal_pmix_collect_all_data && !opal_process_info.is_singleton) { - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-1 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-1"); - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-2 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-2"); - } -#endif - - if (!opal_process_info.is_singleton) { - if (opal_pmix_base_async_modex) { - /* if we are doing an async modex, but we are collecting all - * data, then execute the non-blocking modex in the background. - * All calls to modex_recv will be cached until the background - * modex completes. If collect_all_data is false, then we skip - * the fence completely and retrieve data on-demand from the - * source node. - */ - if (opal_pmix_collect_all_data) { - /* execute the fence_nb in the background to collect - * the data */ - background_fence = true; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; - } - } - } else { - /* we want to do the modex - we block at this point, but we must - * do so in a manner that allows us to call opal_progress so our - * event library can be cycled as we have tied PMIx to that - * event base */ - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); - if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence() failed"; - goto error; - } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - OMPI_TIMING_NEXT("modex"); - MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));