Skip to content

Commit 0861884

Browse files
committed
ompi/mpi_init: fix barrier
Relax the CPU usage pressure from the application processes when doing the modex and barrier in ompi_mpi_init. We see significant latencies in SLURM/PMIx plugin barrier progress because app processes aggressively call opal_progress, pushing away the daemon process that is doing the collective progress.
1 parent eae9d31 commit 0861884

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ opal_list_t ompi_registered_datareps = {{0}};
280280

281281
bool ompi_enable_timing = false, ompi_enable_timing_ext = false;
282282
extern bool ompi_mpi_yield_when_idle;
283+
extern bool ompi_mpi_lazy_wait_in_init;
283284
extern int ompi_mpi_event_tick_rate;
284285

285286
/**
@@ -532,7 +533,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
532533
opal_pmix.register_evhandler(NULL, &info, ompi_errhandler_callback,
533534
ompi_errhandler_registration_callback,
534535
(void*)&errtrk);
535-
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
536+
if( ompi_mpi_lazy_wait_in_init ){
537+
OMPI_LAZY_WAIT_FOR_COMPLETION(errtrk.active);
538+
} else {
539+
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
540+
}
541+
536542
OPAL_LIST_DESTRUCT(&info);
537543
if (OPAL_SUCCESS != errtrk.status) {
538544
error = "Error handler registration";
@@ -658,7 +664,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
658664
if (NULL != opal_pmix.fence_nb) {
659665
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
660666
fence_release, (void*)&active);
661-
OMPI_WAIT_FOR_COMPLETION(active);
667+
if( ompi_mpi_lazy_wait_in_init ){
668+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
669+
} else {
670+
OMPI_WAIT_FOR_COMPLETION(active);
671+
}
662672
} else {
663673
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
664674
}
@@ -835,7 +845,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
835845
if (NULL != opal_pmix.fence_nb) {
836846
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
837847
fence_release, (void*)&active);
838-
OMPI_WAIT_FOR_COMPLETION(active);
848+
if( ompi_mpi_lazy_wait_in_init ){
849+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
850+
} else {
851+
OMPI_WAIT_FOR_COMPLETION(active);
852+
}
839853
} else {
840854
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
841855
}

ompi/runtime/ompi_mpi_params.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6060
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6161

6262
bool ompi_mpi_yield_when_idle = true;
63+
bool ompi_mpi_lazy_wait_in_init = false;
6364
int ompi_mpi_event_tick_rate = -1;
6465
char *ompi_mpi_show_mca_params_string = NULL;
6566
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -112,6 +113,14 @@ int ompi_mpi_register_params(void)
112113
MCA_BASE_VAR_SCOPE_READONLY,
113114
&ompi_mpi_yield_when_idle);
114115

116+
ompi_mpi_lazy_wait_in_init = false;
117+
(void) mca_base_var_register("ompi", "mpi", NULL, "lazy_wait_in_init",
118+
"Avoid aggressive progress in MPI_Init, make sure that PMIx server has timeslots to progress",
119+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
120+
OPAL_INFO_LVL_9,
121+
MCA_BASE_VAR_SCOPE_READONLY,
122+
&ompi_mpi_lazy_wait_in_init);
123+
115124
ompi_mpi_event_tick_rate = -1;
116125
(void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
117126
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",

0 commit comments

Comments
 (0)