Commit 6ea5e84
PMIx_Fences - remove two during MPI initialization
This patch removes two redundant PMIx fences from the MPI initialization procedure when using the World Process Model (WPM). See chapter 11, sections 2 and 3, of the MPI-4 standard for a discussion of the WPM and the new Sessions model. The patch does, however, turn what should have been a local operation supporting initialization of an MPI session into a global one. Note that this does not disable the Sessions feature; for now it only restricts the use cases in which Sessions work to those resembling MPI initialization under the WPM. Refactoring to make ompi_mpi_instance_init_common purely local would require changes that are too invasive for the current state of the 5.0.0 release cycle. See issue #11239. Related to #11166

Signed-off-by: Howard Pritchard <[email protected]>
1 parent dd6b875 commit 6ea5e84
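
For context on the Sessions model mentioned above, the fragment below is a minimal, illustrative MPI-4 Sessions program, not part of this commit: it reaches ompi_mpi_instance_init_common without ever calling MPI_Init, which is why that routine behaving as a global rather than a local operation matters. The string tag passed to MPI_Comm_create_from_group is an arbitrary example value; "mpi://WORLD" is the standard built-in process-set name.

#include <mpi.h>
#include <stdio.h>

int main(void)
{
    MPI_Session session = MPI_SESSION_NULL;
    MPI_Group   group   = MPI_GROUP_NULL;
    MPI_Comm    comm    = MPI_COMM_NULL;
    int rank, size;

    /* Sessions-model initialization: no MPI_Init, no implicit MPI_COMM_WORLD */
    MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &session);

    /* derive a group from the built-in "world" process set and build a
     * communicator from it */
    MPI_Group_from_session_pset(session, "mpi://WORLD", &group);
    MPI_Comm_create_from_group(group, "example.tag", MPI_INFO_NULL,
                               MPI_ERRORS_RETURN, &comm);

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    printf("rank %d of %d (Sessions model)\n", rank, size);

    MPI_Group_free(&group);
    MPI_Comm_free(&comm);
    MPI_Session_finalize(&session);
    return 0;
}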

File tree

4 files changed: +40 -153 lines changed

3rd-party/openpmix

Submodule openpmix updated 112 files

3rd-party/prrte

Submodule prrte updated 111 files

ompi/instance/instance.c

Lines changed: 26 additions & 64 deletions
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
+ * Copyright (c) 2018-2023 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
@@ -18,6 +18,14 @@
 #include "opal/util/show_help.h"
 #include "opal/util/argv.h"
 #include "opal/runtime/opal_params.h"
+#include "opal/util/timings.h"
+#include "opal/mca/allocator/base/base.h"
+#include "opal/mca/rcache/base/base.h"
+#include "opal/mca/mpool/base/base.h"
+#include "opal/mca/smsc/base/base.h"
+#include "opal/mca/mpool/base/mpool_base_tree.h"
+#include "opal/mca/pmix/pmix-internal.h"
+#include "opal/mca/pmix/base/base.h"
 
 #include "ompi/mca/pml/pml.h"
 #include "ompi/runtime/params.h"
@@ -33,26 +41,19 @@
 #include "ompi/dpm/dpm.h"
 #include "ompi/file/file.h"
 #include "ompi/mpiext/mpiext.h"
+#include "ompi/util/timings.h"
 
 #include "ompi/mca/hook/base/base.h"
 #include "ompi/mca/op/base/base.h"
-#include "opal/mca/allocator/base/base.h"
-#include "opal/mca/rcache/base/base.h"
-#include "opal/mca/mpool/base/base.h"
-#include "opal/mca/smsc/base/base.h"
 #include "ompi/mca/bml/base/base.h"
 #include "ompi/mca/pml/base/base.h"
 #include "ompi/mca/coll/base/base.h"
 #include "ompi/mca/osc/base/base.h"
 #include "ompi/mca/part/base/base.h"
 #include "ompi/mca/io/base/base.h"
 #include "ompi/mca/topo/base/base.h"
-#include "opal/mca/pmix/base/base.h"
 
-#include "opal/mca/mpool/base/mpool_base_tree.h"
 #include "ompi/mca/pml/base/pml_base_bsend.h"
-#include "ompi/util/timings.h"
-#include "opal/mca/pmix/pmix-internal.h"
 
 ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}};
 
@@ -341,7 +342,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
     pmix_info_t info[2];
     pmix_status_t rc;
     opal_pmix_lock_t mylock;
-    OMPI_TIMING_INIT(64);
+
+    OPAL_TIMING_ENV_INIT(init_common);
 
     ret = ompi_mpi_instance_retain ();
     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -382,13 +384,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
     }
 
-    OMPI_TIMING_NEXT("initialization");
+    OPAL_TIMING_ENV_NEXT(init_common, "initialization");
 
     /* Setup RTE */
     if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) {
         return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret);
     }
 
+    OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init");
+
     /* open the ompi hook framework */
     for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) {
         ret = mca_base_framework_open (ompi_framework_dependencies[i], 0);
@@ -401,10 +405,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         }
     }
 
-    OMPI_TIMING_NEXT("rte_init");
-    OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup");
-    OMPI_TIMING_IMPORT_OPAL("rte_init");
-
     ompi_rte_initialized = true;
     /* if we are oversubscribed, then set yield_when_idle
      * accordingly */
@@ -507,9 +507,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
     }
 
-    OMPI_TIMING_IMPORT_OPAL("orte_init");
-    OMPI_TIMING_NEXT("rte_init-commit");
-
     /* exchange connection info - this function may also act as a barrier
      * if data exchange is required. The modex occurs solely across procs
      * in our job. If a barrier is required, the "modex" function will
@@ -520,19 +517,20 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ret; /* TODO: need to fix this */
     }
 
-    OMPI_TIMING_NEXT("commit");
+    OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit");
+
 #if (OPAL_ENABLE_TIMING)
     if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
         opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
         if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
             ret = opal_pmix_convert_status(rc);
             return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret);
         }
-        OMPI_TIMING_NEXT("pmix-barrier-1");
+        OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1");
         if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
             return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret);
         }
-        OMPI_TIMING_NEXT("pmix-barrier-2");
+        OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2");
     }
 #endif
 
@@ -577,7 +575,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         }
     }
 
-    OMPI_TIMING_NEXT("modex");
+    OPAL_TIMING_ENV_NEXT(init_common, "modex");
 
     /* select buffered send allocator component to be used */
     if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) {
@@ -625,14 +623,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
     }
 
-    if (mca_pml_base_requires_world ()) {
-        /* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
-         * be the case. */
-        if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
-            return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
-        }
-    }
-
     /* initialize file handles */
     if (OMPI_SUCCESS != (ret = ompi_file_init ())) {
         return ompi_instance_print_error ("ompi_file_init() failed", ret);
@@ -709,47 +699,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret);
     }
 
-    /* Do we need to wait for a debugger? */
-    ompi_rte_wait_for_debugger();
-
-    /* Next timing measurement */
-    OMPI_TIMING_NEXT("modex-barrier");
-
-    if (!opal_process_info.is_singleton) {
-        /* if we executed the above fence in the background, then
-         * we have to wait here for it to complete. However, there
-         * is no reason to do two barriers! */
-        if (background_fence) {
-            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
-        } else if (!ompi_async_mpi_init) {
-            /* wait for everyone to reach this point - this is a hard
-             * barrier requirement at this time, though we hope to relax
-             * it at a later point */
-            bool flag = false;
-            active = true;
-            OPAL_POST_OBJECT(&active);
-            PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
-            if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
-                                                    fence_release, (void*)&active))) {
-                ret = opal_pmix_convert_status(rc);
-                return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
-            }
-            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
-        }
-    }
-
-    /* check for timing request - get stop time and report elapsed
-       time if so, then start the clock again */
-    OMPI_TIMING_NEXT("barrier");
-
 #if OPAL_ENABLE_PROGRESS_THREADS == 0
     /* Start setting up the event engine for MPI operations. Don't
        block in the event library, so that communications don't take
       forever between procs in the dynamic code. This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on RTE-level events, but may greatly reduce non-TCP
       latency. */
-    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
+    int old_event_flags = opal_progress_set_event_flag(0);
+    opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
 #endif
 
     /* Undo OPAL calling opal_progress_event_users_increment() during
@@ -791,6 +749,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
     OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
     opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
 
+    if (background_fence) {
+        OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+    }
+
     return OMPI_SUCCESS;
 }
 
ompi/runtime/ompi_mpi_init.c

Lines changed: 12 additions & 87 deletions
@@ -27,7 +27,7 @@
  * Copyright (c) 2020      Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2021      Nanook Consulting. All rights reserved.
- * Copyright (c) 2021-2022 Triad National Security, LLC. All rights
+ * Copyright (c) 2021-2023 Triad National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -299,16 +299,16 @@ static void fence_release(pmix_status_t status, void *cbdata)
     OPAL_POST_OBJECT(active);
 }
 
+
 int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
                   bool reinit_ok)
 {
     int ret;
     char *error = NULL;
+    bool active = false;
 #if OPAL_USING_INTERNAL_PMIX
     char *evar;
 #endif
-    volatile bool active;
-    bool background_fence = false;
     pmix_info_t info[2];
     pmix_status_t rc;
     OMPI_TIMING_INIT(64);
@@ -392,69 +392,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
         free(tmp);
     }
 
-#if (OPAL_ENABLE_TIMING)
-    if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
-        opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
-        if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
-            ret = opal_pmix_convert_status(rc);
-            error = "timing: pmix-barrier-1 failed";
-            goto error;
-        }
-        OMPI_TIMING_NEXT("pmix-barrier-1");
-        if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
-            ret = opal_pmix_convert_status(rc);
-            error = "timing: pmix-barrier-2 failed";
-            goto error;
-        }
-        OMPI_TIMING_NEXT("pmix-barrier-2");
-    }
-#endif
-
-    if (!opal_process_info.is_singleton) {
-        if (opal_pmix_base_async_modex) {
-            /* if we are doing an async modex, but we are collecting all
-             * data, then execute the non-blocking modex in the background.
-             * All calls to modex_recv will be cached until the background
-             * modex completes. If collect_all_data is false, then we skip
-             * the fence completely and retrieve data on-demand from the
-             * source node.
-             */
-            if (opal_pmix_collect_all_data) {
-                /* execute the fence_nb in the background to collect
-                 * the data */
-                background_fence = true;
-                active = true;
-                OPAL_POST_OBJECT(&active);
-                PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
-                if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
-                                                        fence_release,
-                                                        (void*)&active))) {
-                    ret = opal_pmix_convert_status(rc);
-                    error = "PMIx_Fence_nb() failed";
-                    goto error;
-                }
-            }
-        } else {
-            /* we want to do the modex - we block at this point, but we must
-             * do so in a manner that allows us to call opal_progress so our
-             * event library can be cycled as we have tied PMIx to that
-             * event base */
-            active = true;
-            OPAL_POST_OBJECT(&active);
-            PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
-            rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
-            if( PMIX_SUCCESS != rc) {
-                ret = opal_pmix_convert_status(rc);
-                error = "PMIx_Fence() failed";
-                goto error;
-            }
-            /* cannot just wait on thread as we need to call opal_progress */
-            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
-        }
-    }
-
-    OMPI_TIMING_NEXT("modex");
-
     MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
     MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
 
@@ -495,15 +432,13 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
     OMPI_TIMING_NEXT("modex-barrier");
 
     if (!opal_process_info.is_singleton) {
-        /* if we executed the above fence in the background, then
-         * we have to wait here for it to complete. However, there
-         * is no reason to do two barriers! */
-        if (background_fence) {
-            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
-        } else if (!ompi_async_mpi_init) {
+        if (!ompi_async_mpi_init) {
             /* wait for everyone to reach this point - this is a hard
              * barrier requirement at this time, though we hope to relax
-             * it at a later point */
+             * it at a later point. Right now at least OB1 PML needs this
+             * PMIx_Fence to make sure state associated with adding MPI_COMM_WORLD
+             * to the PML has been set prior to a process sending to another
+             * using the OB1 PML. */
             bool flag = false;
             active = true;
             OPAL_POST_OBJECT(&active);
@@ -517,22 +452,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
             OMPI_LAZY_WAIT_FOR_COMPLETION(active);
         }
     }
-
-    /* check for timing request - get stop time and report elapsed
-       time if so, then start the clock again */
     OMPI_TIMING_NEXT("barrier");
 
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-    /* Start setting up the event engine for MPI operations. Don't
-       block in the event library, so that communications don't take
-       forever between procs in the dynamic code. This will increase
-       CPU utilization for the remainder of MPI_INIT when we are
-       blocking on RTE-level events, but may greatly reduce non-TCP
-       latency. */
-    int old_event_flags = opal_progress_set_event_flag(0);
-    opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
-#endif
-
     /* wire up the mpi interface, if requested. Do this after the
        non-block switch for non-TCP performance. Do before the
       polling change as anyone with a complex wire-up is going to be
@@ -592,6 +513,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
     opal_atomic_wmb();
     opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);
 
+    OMPI_TIMING_IMPORT_OPAL("opal_init_util");
+    OMPI_TIMING_IMPORT_OPAL("opal_init");
+    OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common");
+
     /* Finish last measurement, output results
      * and clear timing structure */
     OMPI_TIMING_NEXT("barrier-finish");
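
The fence retained in ompi_mpi_init.c (with the new comment about the OB1 PML) guards exactly the case of a process sending to a peer immediately after MPI_Init returns. A quick way to exercise that path is a World Process Model program along the following lines; this is an illustrative test, not part of the commit.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, token = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* send immediately after init -- the traffic the remaining
     * PMIx_Fence in ompi_mpi_init() is there to order correctly */
    if (size > 1) {
        if (0 == rank) {
            token = 42;
            MPI_Send(&token, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        } else if (1 == rank) {
            MPI_Recv(&token, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
        }
    }
    printf("rank %d of %d: token = %d\n", rank, size, token);
    MPI_Finalize();
    return 0;
}

Built with mpicc and launched with, for example, mpirun -n 2, it should print the token on ranks 0 and 1; selecting the OB1 PML explicitly (--mca pml ob1) targets the code path the new comment refers to.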
