Skip to content

Commit b3ae758

Browse files
committed
PMIx_Fences - remove unneeded ones during MPI initialization
This patch removes redundant PMIx Fences in the initialization procedure for MPI when using the World Process Model (WPM). See chapter 11, sections 2 and 3, of the MPI-4 standard for a discussion of the WPM and the new Sessions model.

The patch does, however, require turning what should have been a local operation, the initialization of an MPI session, into a global one. Note that this does not disable the sessions feature; it just restricts, at this point, when it will work to use cases that are similar to MPI initialization using the WPM. Refactoring to make ompi_mpi_instance_init_common purely local will require changes that would be too disruptive for the current state of the 5.0.0 release cycle. See issue #11239.

This patch also fixes up the timings reported when building with the timing infrastructure enabled:

mpirun -np 8 ./ring_c
------------------ ompi_mpi_init ------------------
-- [opal_init_core.c:opal_init_util:opal_malloc_init]: 0.000031 / 0.000023 / 0.000043
-- [opal_init_core.c:opal_init_util:opal_show_help_init]: 0.000094 / 0.000085 / 0.000108
-- [opal_init_core.c:opal_init_util:opal_var_init]: 0.000002 / 0.000001 / 0.000003
-- [opal_init_core.c:opal_init_util:opal_var_cache]: 0.000399 / 0.000345 / 0.000442
-- [opal_init_core.c:opal_init_util:opal_arch_init]: 0.000057 / 0.000054 / 0.000065
-- [opal_init_core.c:opal_init_util:mca_base_open]: 0.000201 / 0.000178 / 0.000243
!! [opal_init_core.c:opal_init_util:total]: 0.000784 / 0.000686 / 0.000904
-- [opal_init.c:opal_init:opal_if_init]: 0.000074 / 0.000062 / 0.000084
-- [opal_init.c:opal_init:opal_init_psm]: 0.000010 / 0.000009 / 0.000011
-- [opal_init.c:opal_init:opal_net_init]: 0.000010 / 0.000008 / 0.000012
-- [opal_init.c:opal_init:opal_datatype_init]: 0.003596 / 0.000519 / 0.012865
!! [opal_init.c:opal_init:total]: 0.003689 / 0.000598 / 0.012972
-- [instance.c:ompi_mpi_instance_init_common:initialization]: 0.000991 / 0.000924 / 0.001064
-- [instance.c:ompi_mpi_instance_init_common:ompi_rte_init]: 0.007519 / 0.004406 / 0.016369
-- [instance.c:ompi_mpi_instance_init_common:PMIx_Commit]: 0.003164 / 0.002496 / 0.003640
-- [instance.c:ompi_mpi_instance_init_common:pmix-barrier-1]: 0.007725 / 0.000072 / 0.010423
-- [instance.c:ompi_mpi_instance_init_common:pmix-barrier-2]: 0.000138 / 0.000068 / 0.000159
-- [instance.c:ompi_mpi_instance_init_common:modex]: 0.000181 / 0.000115 / 0.000333
-- [instance.c:ompi_mpi_instance_init_common:modex-barrier]: 0.003143 / 0.002944 / 0.003308
-- [instance.c:ompi_mpi_instance_init_common:barrier]: 0.000373 / 0.000161 / 0.000618
!! [instance.c:ompi_mpi_instance_init_common:total]: 0.023234 / 0.011186 / 0.035914
[ompi_mpi_init.c:ompi_mpi_init:barrier-finish]: 0.023557 / 0.023051 / 0.024240
[ompi_mpi_init:total] 0.023557 / 0.023051 / 0.024240
[ompi_mpi_init:overhead]: 0.000240

The timing points can be refined by others depending on their needs.

Related to #11166

Signed-off-by: Howard Pritchard <[email protected]>
1 parent dd6b875 commit b3ae758

File tree

2 files changed

+29
-155
lines changed

2 files changed

+29
-155
lines changed

ompi/instance/instance.c

Lines changed: 24 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
3+
* Copyright (c) 2018-2023 Triad National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
66
* $COPYRIGHT$
@@ -18,6 +18,14 @@
1818
#include "opal/util/show_help.h"
1919
#include "opal/util/argv.h"
2020
#include "opal/runtime/opal_params.h"
21+
#include "opal/util/timings.h"
22+
#include "opal/mca/allocator/base/base.h"
23+
#include "opal/mca/rcache/base/base.h"
24+
#include "opal/mca/mpool/base/base.h"
25+
#include "opal/mca/smsc/base/base.h"
26+
#include "opal/mca/mpool/base/mpool_base_tree.h"
27+
#include "opal/mca/pmix/pmix-internal.h"
28+
#include "opal/mca/pmix/base/base.h"
2129

2230
#include "ompi/mca/pml/pml.h"
2331
#include "ompi/runtime/params.h"
@@ -33,26 +41,19 @@
3341
#include "ompi/dpm/dpm.h"
3442
#include "ompi/file/file.h"
3543
#include "ompi/mpiext/mpiext.h"
44+
#include "ompi/util/timings.h"
3645

3746
#include "ompi/mca/hook/base/base.h"
3847
#include "ompi/mca/op/base/base.h"
39-
#include "opal/mca/allocator/base/base.h"
40-
#include "opal/mca/rcache/base/base.h"
41-
#include "opal/mca/mpool/base/base.h"
42-
#include "opal/mca/smsc/base/base.h"
4348
#include "ompi/mca/bml/base/base.h"
4449
#include "ompi/mca/pml/base/base.h"
4550
#include "ompi/mca/coll/base/base.h"
4651
#include "ompi/mca/osc/base/base.h"
4752
#include "ompi/mca/part/base/base.h"
4853
#include "ompi/mca/io/base/base.h"
4954
#include "ompi/mca/topo/base/base.h"
50-
#include "opal/mca/pmix/base/base.h"
5155

52-
#include "opal/mca/mpool/base/mpool_base_tree.h"
5356
#include "ompi/mca/pml/base/pml_base_bsend.h"
54-
#include "ompi/util/timings.h"
55-
#include "opal/mca/pmix/pmix-internal.h"
5657

5758
ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}};
5859

@@ -341,7 +342,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
341342
pmix_info_t info[2];
342343
pmix_status_t rc;
343344
opal_pmix_lock_t mylock;
344-
OMPI_TIMING_INIT(64);
345+
346+
OPAL_TIMING_ENV_INIT(init_common);
345347

346348
ret = ompi_mpi_instance_retain ();
347349
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -382,13 +384,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
382384
mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
383385
}
384386

385-
OMPI_TIMING_NEXT("initialization");
387+
OPAL_TIMING_ENV_NEXT(init_common, "initialization");
386388

387389
/* Setup RTE */
388390
if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) {
389391
return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret);
390392
}
391393

394+
OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init");
395+
392396
/* open the ompi hook framework */
393397
for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) {
394398
ret = mca_base_framework_open (ompi_framework_dependencies[i], 0);
@@ -401,10 +405,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
401405
}
402406
}
403407

404-
OMPI_TIMING_NEXT("rte_init");
405-
OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup");
406-
OMPI_TIMING_IMPORT_OPAL("rte_init");
407-
408408
ompi_rte_initialized = true;
409409
/* if we are oversubscribed, then set yield_when_idle
410410
* accordingly */
@@ -507,9 +507,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
507507
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
508508
}
509509

510-
OMPI_TIMING_IMPORT_OPAL("orte_init");
511-
OMPI_TIMING_NEXT("rte_init-commit");
512-
513510
/* exchange connection info - this function may also act as a barrier
514511
* if data exchange is required. The modex occurs solely across procs
515512
* in our job. If a barrier is required, the "modex" function will
@@ -520,19 +517,20 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
520517
return ret; /* TODO: need to fix this */
521518
}
522519

523-
OMPI_TIMING_NEXT("commit");
520+
OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit");
521+
524522
#if (OPAL_ENABLE_TIMING)
525523
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
526524
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
527525
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
528526
ret = opal_pmix_convert_status(rc);
529527
return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret);
530528
}
531-
OMPI_TIMING_NEXT("pmix-barrier-1");
529+
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1");
532530
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
533531
return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret);
534532
}
535-
OMPI_TIMING_NEXT("pmix-barrier-2");
533+
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2");
536534
}
537535
#endif
538536

@@ -577,7 +575,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
577575
}
578576
}
579577

580-
OMPI_TIMING_NEXT("modex");
578+
OPAL_TIMING_ENV_NEXT(init_common, "modex");
581579

582580
/* select buffered send allocator component to be used */
583581
if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) {
@@ -625,14 +623,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
625623
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
626624
}
627625

628-
if (mca_pml_base_requires_world ()) {
629-
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
630-
* be the case. */
631-
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
632-
return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
633-
}
634-
}
635-
636626
/* initialize file handles */
637627
if (OMPI_SUCCESS != (ret = ompi_file_init ())) {
638628
return ompi_instance_print_error ("ompi_file_init() failed", ret);
@@ -709,11 +699,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
709699
return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret);
710700
}
711701

712-
/* Do we need to wait for a debugger? */
713-
ompi_rte_wait_for_debugger();
714-
715702
/* Next timing measurement */
716-
OMPI_TIMING_NEXT("modex-barrier");
703+
OPAL_TIMING_ENV_NEXT(init_common, "modex-barrier");
717704

718705
if (!opal_process_info.is_singleton) {
719706
/* if we executed the above fence in the background, then
@@ -738,9 +725,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
738725
}
739726
}
740727

741-
/* check for timing request - get stop time and report elapsed
742-
time if so, then start the clock again */
743-
OMPI_TIMING_NEXT("barrier");
728+
OPAL_TIMING_ENV_NEXT(init_common, "barrier");
744729

745730
#if OPAL_ENABLE_PROGRESS_THREADS == 0
746731
/* Start setting up the event engine for MPI operations. Don't
@@ -749,7 +734,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
749734
CPU utilization for the remainder of MPI_INIT when we are
750735
blocking on RTE-level events, but may greatly reduce non-TCP
751736
latency. */
752-
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
737+
int old_event_flags = opal_progress_set_event_flag(0);
738+
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
753739
#endif
754740

755741
/* Undo OPAL calling opal_progress_event_users_increment() during

ompi/runtime/ompi_mpi_init.c

Lines changed: 5 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
2828
* All Rights reserved.
2929
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
30-
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
30+
* Copyright (c) 2021-2023 Triad National Security, LLC. All rights
3131
* reserved.
3232
* $COPYRIGHT$
3333
*
@@ -291,14 +291,6 @@ void ompi_mpi_thread_level(int requested, int *provided)
291291
MPI_THREAD_MULTIPLE);
292292
}
293293

294-
static void fence_release(pmix_status_t status, void *cbdata)
295-
{
296-
volatile bool *active = (volatile bool*)cbdata;
297-
OPAL_ACQUIRE_OBJECT(active);
298-
*active = false;
299-
OPAL_POST_OBJECT(active);
300-
}
301-
302294
int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
303295
bool reinit_ok)
304296
{
@@ -307,9 +299,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
307299
#if OPAL_USING_INTERNAL_PMIX
308300
char *evar;
309301
#endif
310-
volatile bool active;
311-
bool background_fence = false;
312-
pmix_info_t info[2];
313302
pmix_status_t rc;
314303
OMPI_TIMING_INIT(64);
315304

@@ -392,69 +381,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
392381
free(tmp);
393382
}
394383

395-
#if (OPAL_ENABLE_TIMING)
396-
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
397-
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
398-
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
399-
ret = opal_pmix_convert_status(rc);
400-
error = "timing: pmix-barrier-1 failed";
401-
goto error;
402-
}
403-
OMPI_TIMING_NEXT("pmix-barrier-1");
404-
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
405-
ret = opal_pmix_convert_status(rc);
406-
error = "timing: pmix-barrier-2 failed";
407-
goto error;
408-
}
409-
OMPI_TIMING_NEXT("pmix-barrier-2");
410-
}
411-
#endif
412-
413-
if (!opal_process_info.is_singleton) {
414-
if (opal_pmix_base_async_modex) {
415-
/* if we are doing an async modex, but we are collecting all
416-
* data, then execute the non-blocking modex in the background.
417-
* All calls to modex_recv will be cached until the background
418-
* modex completes. If collect_all_data is false, then we skip
419-
* the fence completely and retrieve data on-demand from the
420-
* source node.
421-
*/
422-
if (opal_pmix_collect_all_data) {
423-
/* execute the fence_nb in the background to collect
424-
* the data */
425-
background_fence = true;
426-
active = true;
427-
OPAL_POST_OBJECT(&active);
428-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
429-
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
430-
fence_release,
431-
(void*)&active))) {
432-
ret = opal_pmix_convert_status(rc);
433-
error = "PMIx_Fence_nb() failed";
434-
goto error;
435-
}
436-
}
437-
} else {
438-
/* we want to do the modex - we block at this point, but we must
439-
* do so in a manner that allows us to call opal_progress so our
440-
* event library can be cycled as we have tied PMIx to that
441-
* event base */
442-
active = true;
443-
OPAL_POST_OBJECT(&active);
444-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
445-
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
446-
if( PMIX_SUCCESS != rc) {
447-
ret = opal_pmix_convert_status(rc);
448-
error = "PMIx_Fence() failed";
449-
goto error;
450-
}
451-
/* cannot just wait on thread as we need to call opal_progress */
452-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
453-
}
454-
}
455-
456-
OMPI_TIMING_NEXT("modex");
457-
458384
MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
459385
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
460386

@@ -491,48 +417,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
491417
/* Do we need to wait for a debugger? */
492418
ompi_rte_wait_for_debugger();
493419

494-
/* Next timing measurement */
495-
OMPI_TIMING_NEXT("modex-barrier");
496-
497-
if (!opal_process_info.is_singleton) {
498-
/* if we executed the above fence in the background, then
499-
* we have to wait here for it to complete. However, there
500-
* is no reason to do two barriers! */
501-
if (background_fence) {
502-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
503-
} else if (!ompi_async_mpi_init) {
504-
/* wait for everyone to reach this point - this is a hard
505-
* barrier requirement at this time, though we hope to relax
506-
* it at a later point */
507-
bool flag = false;
508-
active = true;
509-
OPAL_POST_OBJECT(&active);
510-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
511-
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
512-
fence_release, (void*)&active))) {
513-
ret = opal_pmix_convert_status(rc);
514-
error = "PMIx_Fence_nb() failed";
515-
goto error;
516-
}
517-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
518-
}
519-
}
520-
521-
/* check for timing request - get stop time and report elapsed
522-
time if so, then start the clock again */
523-
OMPI_TIMING_NEXT("barrier");
524-
525-
#if OPAL_ENABLE_PROGRESS_THREADS == 0
526-
/* Start setting up the event engine for MPI operations. Don't
527-
block in the event library, so that communications don't take
528-
forever between procs in the dynamic code. This will increase
529-
CPU utilization for the remainder of MPI_INIT when we are
530-
blocking on RTE-level events, but may greatly reduce non-TCP
531-
latency. */
532-
int old_event_flags = opal_progress_set_event_flag(0);
533-
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
534-
#endif
535-
536420
/* wire up the mpi interface, if requested. Do this after the
537421
non-block switch for non-TCP performance. Do before the
538422
polling change as anyone with a complex wire-up is going to be
@@ -592,6 +476,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
592476
opal_atomic_wmb();
593477
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);
594478

479+
OMPI_TIMING_IMPORT_OPAL("opal_init_util");
480+
OMPI_TIMING_IMPORT_OPAL("opal_init");
481+
OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common");
482+
595483
/* Finish last measurement, output results
596484
* and clear timing structure */
597485
OMPI_TIMING_NEXT("barrier-finish");

0 commit comments

Comments
 (0)