From b3ae7589c89ca53f8c837dbe537ad84f5ddbdfdf Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Fri, 13 Jan 2023 12:32:44 -0700 Subject: [PATCH 1/2] PMIx_Fences - remove unneeded ones during MPI initialization This patch removes redundant PMIx Fences in the initialization procedure for MPI when using the World Process Model (WPM). See chapter 11 sections 2 and 3 of the MPI-4 standard for a discussion of the WPM and new Sessions model. The patch does, however, require that what should have been a local operation to support initialization of an MPI session, into a global one. Note this does not disable the sessions feature but just restricts when it will work at this point to use cases that are similar to MPI initialization using the WPM. Refactoring to make ompi_mpi_instance_init_common purely local will require changes that would be too impactive for the current state of the 5.0.0 release cycle. See issue #11239. This patch also fixes up the timings reported when building using the timing infrastructure: mpirun -np 8 ./ring_c ------------------ ompi_mpi_init ------------------ -- [opal_init_core.c:opal_init_util:opal_malloc_init]: 0.000031 / 0.000023 / 0.000043 -- [opal_init_core.c:opal_init_util:opal_show_help_init]: 0.000094 / 0.000085 / 0.000108 -- [opal_init_core.c:opal_init_util:opal_var_init]: 0.000002 / 0.000001 / 0.000003 -- [opal_init_core.c:opal_init_util:opal_var_cache]: 0.000399 / 0.000345 / 0.000442 -- [opal_init_core.c:opal_init_util:opal_arch_init]: 0.000057 / 0.000054 / 0.000065 -- [opal_init_core.c:opal_init_util:mca_base_open]: 0.000201 / 0.000178 / 0.000243 !! [opal_init_core.c:opal_init_util:total]: 0.000784 / 0.000686 / 0.000904 -- [opal_init.c:opal_init:opal_if_init]: 0.000074 / 0.000062 / 0.000084 -- [opal_init.c:opal_init:opal_init_psm]: 0.000010 / 0.000009 / 0.000011 -- [opal_init.c:opal_init:opal_net_init]: 0.000010 / 0.000008 / 0.000012 -- [opal_init.c:opal_init:opal_datatype_init]: 0.003596 / 0.000519 / 0.012865 !! [opal_init.c:opal_init:total]: 0.003689 / 0.000598 / 0.012972 -- [instance.c:ompi_mpi_instance_init_common:initialization]: 0.000991 / 0.000924 / 0.001064 -- [instance.c:ompi_mpi_instance_init_common:ompi_rte_init]: 0.007519 / 0.004406 / 0.016369 -- [instance.c:ompi_mpi_instance_init_common:PMIx_Commit]: 0.003164 / 0.002496 / 0.003640 -- [instance.c:ompi_mpi_instance_init_common:pmix-barrier-1]: 0.007725 / 0.000072 / 0.010423 -- [instance.c:ompi_mpi_instance_init_common:pmix-barrier-2]: 0.000138 / 0.000068 / 0.000159 -- [instance.c:ompi_mpi_instance_init_common:modex]: 0.000181 / 0.000115 / 0.000333 -- [instance.c:ompi_mpi_instance_init_common:modex-barrier]: 0.003143 / 0.002944 / 0.003308 -- [instance.c:ompi_mpi_instance_init_common:barrier]: 0.000373 / 0.000161 / 0.000618 !! [instance.c:ompi_mpi_instance_init_common:total]: 0.023234 / 0.011186 / 0.035914 [ompi_mpi_init.c:ompi_mpi_init:barrier-finish]: 0.023557 / 0.023051 / 0.024240 [ompi_mpi_init:total] 0.023557 / 0.023051 / 0.024240 [ompi_mpi_init:overhead]: 0.000240 The timing points can be refined by others depending on their needs. Related to #11166 Signed-off-by: Howard Pritchard --- ompi/instance/instance.c | 62 +++++++----------- ompi/runtime/ompi_mpi_init.c | 122 ++--------------------------------- 2 files changed, 29 insertions(+), 155 deletions(-) diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 2d669ce8c4e..e146e3d42e0 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2023 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -18,6 +18,14 @@ #include "opal/util/show_help.h" #include "opal/util/argv.h" #include "opal/runtime/opal_params.h" +#include "opal/util/timings.h" +#include "opal/mca/allocator/base/base.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/smsc/base/base.h" +#include "opal/mca/mpool/base/mpool_base_tree.h" +#include "opal/mca/pmix/pmix-internal.h" +#include "opal/mca/pmix/base/base.h" #include "ompi/mca/pml/pml.h" #include "ompi/runtime/params.h" @@ -33,13 +41,10 @@ #include "ompi/dpm/dpm.h" #include "ompi/file/file.h" #include "ompi/mpiext/mpiext.h" +#include "ompi/util/timings.h" #include "ompi/mca/hook/base/base.h" #include "ompi/mca/op/base/base.h" -#include "opal/mca/allocator/base/base.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/smsc/base/base.h" #include "ompi/mca/bml/base/base.h" #include "ompi/mca/pml/base/base.h" #include "ompi/mca/coll/base/base.h" @@ -47,12 +52,8 @@ #include "ompi/mca/part/base/base.h" #include "ompi/mca/io/base/base.h" #include "ompi/mca/topo/base/base.h" -#include "opal/mca/pmix/base/base.h" -#include "opal/mca/mpool/base/mpool_base_tree.h" #include "ompi/mca/pml/base/pml_base_bsend.h" -#include "ompi/util/timings.h" -#include "opal/mca/pmix/pmix-internal.h" ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}}; @@ -341,7 +342,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) pmix_info_t info[2]; pmix_status_t rc; opal_pmix_lock_t mylock; - OMPI_TIMING_INIT(64); + + OPAL_TIMING_ENV_INIT(init_common); ret = ompi_mpi_instance_retain (); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -382,13 +384,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); } - OMPI_TIMING_NEXT("initialization"); + OPAL_TIMING_ENV_NEXT(init_common, "initialization"); /* Setup RTE */ if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) { return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret); } + OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init"); + /* open the ompi hook framework */ for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) { ret = mca_base_framework_open (ompi_framework_dependencies[i], 0); @@ -401,10 +405,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } } - OMPI_TIMING_NEXT("rte_init"); - OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup"); - OMPI_TIMING_IMPORT_OPAL("rte_init"); - ompi_rte_initialized = true; /* if we are oversubscribed, then set yield_when_idle * accordingly */ @@ -507,9 +507,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("mca_pml_base_select() failed", ret); } - OMPI_TIMING_IMPORT_OPAL("orte_init"); - OMPI_TIMING_NEXT("rte_init-commit"); - /* exchange connection info - this function may also act as a barrier * if data exchange is required. The modex occurs solely across procs * in our job. If a barrier is required, the "modex" function will @@ -520,7 +517,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ret; /* TODO: need to fix this */ } - OMPI_TIMING_NEXT("commit"); + OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit"); + #if (OPAL_ENABLE_TIMING) if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && opal_pmix_collect_all_data && !opal_process_info.is_singleton) { @@ -528,11 +526,11 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) ret = opal_pmix_convert_status(rc); return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret); } - OMPI_TIMING_NEXT("pmix-barrier-1"); + OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1"); if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret); } - OMPI_TIMING_NEXT("pmix-barrier-2"); + OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2"); } #endif @@ -577,7 +575,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } } - OMPI_TIMING_NEXT("modex"); + OPAL_TIMING_ENV_NEXT(init_common, "modex"); /* select buffered send allocator component to be used */ if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) { @@ -625,14 +623,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret); } - if (mca_pml_base_requires_world ()) { - /* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always - * be the case. */ - if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) { - return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret); - } - } - /* initialize file handles */ if (OMPI_SUCCESS != (ret = ompi_file_init ())) { return ompi_instance_print_error ("ompi_file_init() failed", ret); @@ -709,11 +699,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret); } - /* Do we need to wait for a debugger? */ - ompi_rte_wait_for_debugger(); - /* Next timing measurement */ - OMPI_TIMING_NEXT("modex-barrier"); + OPAL_TIMING_ENV_NEXT(init_common, "modex-barrier"); if (!opal_process_info.is_singleton) { /* if we executed the above fence in the background, then @@ -738,9 +725,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) } } - /* check for timing request - get stop time and report elapsed - time if so, then start the clock again */ - OMPI_TIMING_NEXT("barrier"); + OPAL_TIMING_ENV_NEXT(init_common, "barrier"); #if OPAL_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't @@ -749,7 +734,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) CPU utilization for the remainder of MPI_INIT when we are blocking on RTE-level events, but may greatly reduce non-TCP latency. */ - opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); + int old_event_flags = opal_progress_set_event_flag(0); + opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); #endif /* Undo OPAL calling opal_progress_event_users_increment() during diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 19c0999d163..62a9eb293ca 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -27,7 +27,7 @@ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2021-2022 Triad National Security, LLC. All rights + * Copyright (c) 2021-2023 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -291,14 +291,6 @@ void ompi_mpi_thread_level(int requested, int *provided) MPI_THREAD_MULTIPLE); } -static void fence_release(pmix_status_t status, void *cbdata) -{ - volatile bool *active = (volatile bool*)cbdata; - OPAL_ACQUIRE_OBJECT(active); - *active = false; - OPAL_POST_OBJECT(active); -} - int ompi_mpi_init(int argc, char **argv, int requested, int *provided, bool reinit_ok) { @@ -307,9 +299,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, #if OPAL_USING_INTERNAL_PMIX char *evar; #endif - volatile bool active; - bool background_fence = false; - pmix_info_t info[2]; pmix_status_t rc; OMPI_TIMING_INIT(64); @@ -392,69 +381,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, free(tmp); } -#if (OPAL_ENABLE_TIMING) - if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && - opal_pmix_collect_all_data && !opal_process_info.is_singleton) { - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-1 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-1"); - if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) { - ret = opal_pmix_convert_status(rc); - error = "timing: pmix-barrier-2 failed"; - goto error; - } - OMPI_TIMING_NEXT("pmix-barrier-2"); - } -#endif - - if (!opal_process_info.is_singleton) { - if (opal_pmix_base_async_modex) { - /* if we are doing an async modex, but we are collecting all - * data, then execute the non-blocking modex in the background. - * All calls to modex_recv will be cached until the background - * modex completes. If collect_all_data is false, then we skip - * the fence completely and retrieve data on-demand from the - * source node. - */ - if (opal_pmix_collect_all_data) { - /* execute the fence_nb in the background to collect - * the data */ - background_fence = true; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; - } - } - } else { - /* we want to do the modex - we block at this point, but we must - * do so in a manner that allows us to call opal_progress so our - * event library can be cycled as we have tied PMIx to that - * event base */ - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); - if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence() failed"; - goto error; - } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - OMPI_TIMING_NEXT("modex"); - MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm)); @@ -491,48 +417,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, /* Do we need to wait for a debugger? */ ompi_rte_wait_for_debugger(); - /* Next timing measurement */ - OMPI_TIMING_NEXT("modex-barrier"); - - if (!opal_process_info.is_singleton) { - /* if we executed the above fence in the background, then - * we have to wait here for it to complete. However, there - * is no reason to do two barriers! */ - if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } else if (!ompi_async_mpi_init) { - /* wait for everyone to reach this point - this is a hard - * barrier requirement at this time, though we hope to relax - * it at a later point */ - bool flag = false; - active = true; - OPAL_POST_OBJECT(&active); - PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; - } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); - } - } - - /* check for timing request - get stop time and report elapsed - time if so, then start the clock again */ - OMPI_TIMING_NEXT("barrier"); - -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - /* Start setting up the event engine for MPI operations. Don't - block in the event library, so that communications don't take - forever between procs in the dynamic code. This will increase - CPU utilization for the remainder of MPI_INIT when we are - blocking on RTE-level events, but may greatly reduce non-TCP - latency. */ - int old_event_flags = opal_progress_set_event_flag(0); - opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); -#endif - /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. Do before the polling change as anyone with a complex wire-up is going to be @@ -592,6 +476,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, opal_atomic_wmb(); opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED); + OMPI_TIMING_IMPORT_OPAL("opal_init_util"); + OMPI_TIMING_IMPORT_OPAL("opal_init"); + OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common"); + /* Finish last measurement, output results * and clear timing structure */ OMPI_TIMING_NEXT("barrier-finish"); From 712f0e4fe981ff071954217780356ba25eb6e2e1 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Thu, 26 Jan 2023 16:22:46 -0700 Subject: [PATCH 2/2] test moving mpi3 comm_world init Signed-off-by: Howard Pritchard --- ompi/instance/instance.c | 4 ++++ ompi/runtime/ompi_mpi_init.c | 6 ------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index e146e3d42e0..87a33212d66 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -699,6 +699,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret); } + if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) { + return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret); + } + /* Next timing measurement */ OPAL_TIMING_ENV_NEXT(init_common, "modex-barrier"); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 62a9eb293ca..39e34665a13 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -353,12 +353,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, ompi_hook_base_mpi_init_top_post_opal(argc, argv, requested, provided); - /* initialize communicator subsystem */ - if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) { - error = "ompi_mpi_init: ompi_comm_init_mpi3 failed"; - goto error; - } - /* Bozo argument check */ if (NULL == argv && argc > 1) { ret = OMPI_ERR_BAD_PARAM;