Skip to content

Commit f1733ee

Browse files
committed
ompi/mpi_init: fix barrier
Relax the CPU usage pressure from the application processes when doing the modex and barrier in ompi_mpi_init. We see significant latencies in SLURM/PMIx plugin barrier progress because the app processes aggressively call opal_progress, pushing away the daemon process that is doing the collective progress. (cherry-picked from 0861884) Signed-off-by: Artem Polyakov <[email protected]>
1 parent 87a79fa commit f1733ee

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ opal_list_t ompi_registered_datareps = {{0}};
274274

275275
bool ompi_enable_timing = false, ompi_enable_timing_ext = false;
276276
extern bool ompi_mpi_yield_when_idle;
277+
extern bool ompi_mpi_lazy_wait_in_init;
277278
extern int ompi_mpi_event_tick_rate;
278279

279280
/**
@@ -639,7 +640,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
639640
if (NULL != opal_pmix.fence_nb) {
640641
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
641642
fence_release, (void*)&active);
642-
OMPI_WAIT_FOR_COMPLETION(active);
643+
if( ompi_mpi_lazy_wait_in_init ){
644+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
645+
} else {
646+
OMPI_WAIT_FOR_COMPLETION(active);
647+
}
643648
} else {
644649
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
645650
}
@@ -809,7 +814,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
809814
if (NULL != opal_pmix.fence_nb) {
810815
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
811816
fence_release, (void*)&active);
812-
OMPI_WAIT_FOR_COMPLETION(active);
817+
if( ompi_mpi_lazy_wait_in_init ){
818+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
819+
} else {
820+
OMPI_WAIT_FOR_COMPLETION(active);
821+
}
813822
} else {
814823
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
815824
}

ompi/runtime/ompi_mpi_params.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6060
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6161

6262
bool ompi_mpi_yield_when_idle = true;
63+
bool ompi_mpi_lazy_wait_in_init = false;
6364
int ompi_mpi_event_tick_rate = -1;
6465
char *ompi_mpi_show_mca_params_string = NULL;
6566
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -109,6 +110,14 @@ int ompi_mpi_register_params(void)
109110
MCA_BASE_VAR_SCOPE_READONLY,
110111
&ompi_mpi_yield_when_idle);
111112

113+
ompi_mpi_lazy_wait_in_init = false;
114+
(void) mca_base_var_register("ompi", "mpi", NULL, "lazy_wait_in_init",
115+
"Avoid aggressive progress in MPI_Init, make sure that PMIx server has timeslots to progress",
116+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
117+
OPAL_INFO_LVL_9,
118+
MCA_BASE_VAR_SCOPE_READONLY,
119+
&ompi_mpi_lazy_wait_in_init);
120+
112121
ompi_mpi_event_tick_rate = -1;
113122
(void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
114123
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",

0 commit comments

Comments
 (0)