diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 2d669ce8c4e..50189570240 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2023 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -18,6 +18,14 @@ #include "opal/util/show_help.h" #include "opal/util/argv.h" #include "opal/runtime/opal_params.h" +#include "opal/util/timings.h" +#include "opal/mca/allocator/base/base.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/mpool/base/mpool_base_tree.h" +#include "opal/mca/pmix/pmix-internal.h" +#include "opal/mca/pmix/base/base.h" #include "ompi/mca/pml/pml.h" #include "ompi/runtime/params.h" @@ -33,13 +41,10 @@ #include "ompi/dpm/dpm.h" #include "ompi/file/file.h" #include "ompi/mpiext/mpiext.h" +#include "ompi/util/timings.h" #include "ompi/mca/hook/base/base.h" #include "ompi/mca/op/base/base.h" -#include "opal/mca/allocator/base/base.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/smsc/base/base.h" #include "ompi/mca/bml/base/base.h" #include "ompi/mca/pml/base/base.h" #include "ompi/mca/coll/base/base.h" @@ -47,12 +52,8 @@ #include "ompi/mca/part/base/base.h" #include "ompi/mca/io/base/base.h" #include "ompi/mca/topo/base/base.h" -#include "opal/mca/pmix/base/base.h" -#include "opal/mca/mpool/base/mpool_base_tree.h" #include "ompi/mca/pml/base/pml_base_bsend.h" -#include "ompi/util/timings.h" -#include "opal/mca/pmix/pmix-internal.h" ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}}; @@ -341,7 +342,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) pmix_info_t info[2]; pmix_status_t rc; opal_pmix_lock_t mylock; - OMPI_TIMING_INIT(64); + + OPAL_TIMING_ENV_INIT(init_common); ret = ompi_mpi_instance_retain (); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -382,13 +384,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); } - OMPI_TIMING_NEXT("initialization"); + OPAL_TIMING_ENV_NEXT(init_common, "initialization"); /* Setup RTE */ if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) { return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret); } + OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init"); + /* open the ompi hook framework */ for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) { ret = mca_base_framework_open (ompi_framework_dependencies[i], 0); @@ -401,10 +405,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } } - OMPI_TIMING_NEXT("rte_init"); - OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup"); - OMPI_TIMING_IMPORT_OPAL("rte_init"); - ompi_rte_initialized = true; /* if we are oversubscribed, then set yield_when_idle * accordingly */ @@ -507,9 +507,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("mca_pml_base_select() failed", ret); } - OMPI_TIMING_IMPORT_OPAL("orte_init"); - OMPI_TIMING_NEXT("rte_init-commit"); - /* exchange connection info - this function may also act as a barrier * if data exchange is required. The modex occurs solely across procs * in our job. If a barrier is required, the "modex" function will @@ -520,7 +517,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ret; /* TODO: need to fix this */ } - OMPI_TIMING_NEXT("commit"); + OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit"); + #if (OPAL_ENABLE_TIMING) if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && opal_pmix_collect_all_data && !opal_process_info.is_singleton) { @@ -528,11 +526,11 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) ret = opal_pmix_convert_status(rc); return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret); } - OMPI_TIMING_NEXT("pmix-barrier-1"); + OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1"); if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret); } - OMPI_TIMING_NEXT("pmix-barrier-2"); + OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2"); } #endif @@ -577,7 +575,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } } - OMPI_TIMING_NEXT("modex"); + OPAL_TIMING_ENV_NEXT(init_common, "modex"); /* select buffered send allocator component to be used */ if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) { @@ -625,14 +623,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret); } - if (mca_pml_base_requires_world ()) { - /* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always - * be the case. */ - if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) { - return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret); - } - } - /* initialize file handles */ if (OMPI_SUCCESS != (ret = ompi_file_init ())) { return ompi_instance_print_error ("ompi_file_init() failed", ret); @@ -709,39 +699,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret); } - /* Do we need to wait for a debugger? */ - ompi_rte_wait_for_debugger(); - - /* Next timing measurement */ - OMPI_TIMING_NEXT("modex-barrier"); - - if (!opal_process_info.is_singleton) { - /* if we executed the above fence in the background, then - * we have to wait here for it to complete. However, there - * is no reason to do two barriers! */ - if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } else if (!ompi_async_mpi_init) { - /* wait for everyone to reach this point - this is a hard - * barrier requirement at this time, though we hope to relax - * it at a later point */ - bool flag = false; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); - } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - /* check for timing request - get stop time and report elapsed - time if so, then start the clock again */ - OMPI_TIMING_NEXT("barrier"); - #if OPAL_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take @@ -749,7 +706,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) CPU utilization for the remainder of MPI_INIT when we are blocking on RTE-level events, but may greatly reduce non-TCP latency. */ - opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); + int old_event_flags = opal_progress_set_event_flag(0); + opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); #endif /* Undo OPAL calling opal_progress_event_users_increment() during @@ -791,6 +749,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP); + if (background_fence) { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + } + return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 19c0999d163..173bf548e67 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -27,7 +27,7 @@ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2021-2022 Triad National Security, LLC. All rights + * Copyright (c) 2021-2023 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -299,16 +299,16 @@ static void fence_release(pmix_status_t status, void *cbdata) OPAL_POST_OBJECT(active); } + int ompi_mpi_init(int argc, char **argv, int requested, int *provided, bool reinit_ok) { int ret; char *error = NULL; + bool active = false; #if OPAL_USING_INTERNAL_PMIX char *evar; #endif - volatile bool active; - bool background_fence = false; pmix_info_t info[2]; pmix_status_t rc; OMPI_TIMING_INIT(64); @@ -392,69 +392,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, free(tmp); } -#if (OPAL_ENABLE_TIMING) - if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && - opal_pmix_collect_all_data && !opal_process_info.is_singleton) { - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-1 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-1"); - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-2 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-2"); - } -#endif - - if (!opal_process_info.is_singleton) { - if (opal_pmix_base_async_modex) { - /* if we are doing an async modex, but we are collecting all - * data, then execute the non-blocking modex in the background. - * All calls to modex_recv will be cached until the background - * modex completes. If collect_all_data is false, then we skip - * the fence completely and retrieve data on-demand from the - * source node. - */ - if (opal_pmix_collect_all_data) { - /* execute the fence_nb in the background to collect - * the data */ - background_fence = true; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; - } - } - } else { - /* we want to do the modex - we block at this point, but we must - * do so in a manner that allows us to call opal_progress so our - * event library can be cycled as we have tied PMIx to that - * event base */ - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); - if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence() failed"; - goto error; - } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - OMPI_TIMING_NEXT("modex"); - MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm)); @@ -495,15 +432,13 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, OMPI_TIMING_NEXT("modex-barrier"); if (!opal_process_info.is_singleton) { - /* if we executed the above fence in the background, then - * we have to wait here for it to complete. However, there - * is no reason to do two barriers! */ - if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } else if (!ompi_async_mpi_init) { + if (!ompi_async_mpi_init) { /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax - * it at a later point */ + * it at a later point. Right now at least OB1 PML needs this + * PMIx_Fence to make sure state associated with adding MPI_COMM_WORLD + * to the PML has been set prior to a process sending to another + * using the OB1 PML. */ bool flag = false; active = true; OPAL_POST_OBJECT(&active); @@ -517,22 +452,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } - - /* check for timing request - get stop time and report elapsed - time if so, then start the clock again */ OMPI_TIMING_NEXT("barrier"); -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - /* Start setting up the event engine for MPI operations. Don't - block in the event library, so that communications don't take - forever between procs in the dynamic code. This will increase - CPU utilization for the remainder of MPI_INIT when we are - blocking on RTE-level events, but may greatly reduce non-TCP - latency. */ - int old_event_flags = opal_progress_set_event_flag(0); - opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); -#endif - /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. Do before the polling change as anyone with a complex wire-up is going to be @@ -592,6 +513,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, opal_atomic_wmb(); opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED); + OMPI_TIMING_IMPORT_OPAL("opal_init_util"); + OMPI_TIMING_IMPORT_OPAL("opal_init"); + OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common"); + /* Finish last measurement, output results * and clear timing structure */ OMPI_TIMING_NEXT("barrier-finish");