Skip to content

Commit c3fe37d

Browse files
authored
Merge pull request #8408 from rhc54/cmr40/pml
v4.0.x: Update the PML selection/check logic to avoid direct modex "storms"
2 parents 6207edb + 808b476 commit c3fe37d

File tree

1 file changed

+98
-57
lines changed

1 file changed

+98
-57
lines changed

ompi/mca/pml/base/pml_base_select.c

Lines changed: 98 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
* All rights reserved.
1313
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
16-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
15+
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
17+
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights
18+
* reserved.
19+
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
1720
* $COPYRIGHT$
1821
*
1922
* Additional copyrights may follow
@@ -44,8 +47,6 @@ typedef struct opened_component_t {
4447
mca_pml_base_component_t *om_component;
4548
} opened_component_t;
4649

47-
static bool modex_reqd=false;
48-
4950
/**
5051
* Function for selecting one component from all those that are
5152
* available.
@@ -59,7 +60,7 @@ static bool modex_reqd=false;
5960
int mca_pml_base_select(bool enable_progress_threads,
6061
bool enable_mpi_threads)
6162
{
62-
int i, priority = 0, best_priority = 0, num_pml = 0;
63+
int i, priority = 0, best_priority = 0, num_pml = 0, ret = 0;
6364
opal_list_item_t *item = NULL;
6465
mca_base_component_list_item_t *cli = NULL;
6566
mca_pml_base_component_t *component = NULL, *best_component = NULL;
@@ -186,12 +187,13 @@ int mca_pml_base_select(bool enable_progress_threads,
186187
"selected %s best priority %d\n",
187188
best_component->pmlm_version.mca_component_name, best_priority);
188189

189-
/* if more than one PML could be considered, then we still need the
190-
* modex since we cannot know which one will be selected on all procs
191-
*/
192-
if (1 < num_pml) {
193-
modex_reqd = true;
194-
}
190+
/* Save the winner */
191+
192+
mca_pml_base_selected_component = *best_component;
193+
mca_pml = *best_module;
194+
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
195+
"select: component %s selected",
196+
mca_pml_base_selected_component.pmlm_version.mca_component_name );
195197

196198
/* Finalize all non-selected components */
197199

@@ -239,14 +241,6 @@ int mca_pml_base_select(bool enable_progress_threads,
239241
}
240242
#endif
241243

242-
/* Save the winner */
243-
244-
mca_pml_base_selected_component = *best_component;
245-
mca_pml = *best_module;
246-
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
247-
"select: component %s selected",
248-
mca_pml_base_selected_component.pmlm_version.mca_component_name );
249-
250244
/* This base function closes, unloads, and removes from the
251245
available list all unselected components. The available list will
252246
contain only the selected component. */
@@ -287,13 +281,11 @@ int mca_pml_base_select(bool enable_progress_threads,
287281
}
288282

289283
/* register winner in the modex */
290-
if (modex_reqd && 0 == OMPI_PROC_MY_NAME->vpid) {
291-
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
292-
}
284+
ret = mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
293285

294286
/* All done */
295287

296-
return OMPI_SUCCESS;
288+
return ret;
297289
}
298290

299291
/* need a "commonly" named PML structure so everything ends up in the
@@ -307,49 +299,56 @@ static mca_base_component_t pml_base_component = {
307299
};
308300

309301

302+
/*
303+
* If direct modex, then publish PML for all procs. If full modex then
304+
* publish PML for rank 0 only. This information is used during add_procs
305+
* to perform PML check.
306+
* During PML check, for direct modex, compare our PML with the peer's
307+
* PML for all procs in the add_procs call. This does not change the
308+
* connection complexity of modex transfers, since adding the proc is
309+
* going to get the peer information in the MTL/PML/BTL anyway.
310+
* For full modex, compare our PML with rank 0.
311+
* Direct Modex is performed when collect_all_data is false, as we do
312+
* not perform a fence operation during MPI_Init if async_modex is true.
313+
* If async_modex is false and collect_all_data is false then we do a
314+
* zero-byte barrier and we would still require direct modex during
315+
* add_procs
316+
*/
310317
int
311318
mca_pml_base_pml_selected(const char *name)
312319
{
313-
int rc;
320+
int rc = 0;
321+
322+
if (!opal_pmix_collect_all_data || 0 == OMPI_PROC_MY_NAME->vpid) {
323+
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &pml_base_component, name,
324+
strlen(name) + 1);
325+
}
314326

315-
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &pml_base_component, name, strlen(name) + 1);
316327
return rc;
317328
}
318329

319-
int
320-
mca_pml_base_pml_check_selected(const char *my_pml,
321-
ompi_proc_t **procs,
322-
size_t nprocs)
330+
static int
331+
mca_pml_base_pml_check_selected_impl(const char *my_pml,
332+
opal_process_name_t proc_name)
323333
{
324334
size_t size;
325-
int ret;
335+
int ret = 0;
326336
char *remote_pml;
327337

328-
/* if no modex was required by the PML, then
329-
* we can assume success
330-
*/
331-
if (!modex_reqd) {
338+
/* if proc_name is our own name (OMPI_PROC_MY_NAME), there is nothing to compare: assume success */
339+
if (0 == opal_compare_proc(ompi_proc_local()->super.proc_name, proc_name)) {
332340
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
333-
"check:select: modex not reqd");
341+
"check:select: PML check not necessary on self");
334342
return OMPI_SUCCESS;
335343
}
336-
337-
/* if we are rank=0, then we can also assume success */
338-
if (0 == OMPI_PROC_MY_NAME->vpid) {
344+
OPAL_MODEX_RECV_STRING(ret,
345+
mca_base_component_to_string(&pml_base_component),
346+
&proc_name, (void**) &remote_pml, &size);
347+
if (OPAL_ERR_NOT_FOUND == ret) {
339348
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
340-
"check:select: rank=0");
341-
return OMPI_SUCCESS;
342-
}
343-
344-
/* get the name of the PML module selected by rank=0 */
345-
OPAL_MODEX_RECV(ret, &pml_base_component,
346-
&procs[0]->super.proc_name, (void**) &remote_pml, &size);
347-
348-
/* if this key wasn't found, then just assume all is well... */
349-
if (OMPI_SUCCESS != ret) {
350-
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
351-
"check:select: modex data not found");
352-
return OMPI_SUCCESS;
349+
"check:select: PML modex for process %s not found",
350+
OMPI_NAME_PRINT(&proc_name));
351+
return OMPI_ERR_NOT_FOUND;
353352
}
354353

355354
/* the remote pml returned should never be NULL if an error
@@ -358,26 +357,68 @@ mca_pml_base_pml_check_selected(const char *my_pml,
358357
*/
359358
if (NULL == remote_pml) {
360359
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
361-
"check:select: got a NULL pml from rank=0");
360+
"check:select: got a NULL pml from process %s",
361+
OMPI_NAME_PRINT(&proc_name));
362362
return OMPI_ERR_UNREACH;
363363
}
364364

365365
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
366-
"check:select: checking my pml %s against rank=0 pml %s",
367-
my_pml, remote_pml);
366+
"check:select: checking my pml %s against process %s"
367+
" pml %s", my_pml, OMPI_NAME_PRINT(&proc_name),
368+
remote_pml);
368369

369370
/* if that module doesn't match my own, return an error */
370371
if ((size != strlen(my_pml) + 1) ||
371372
(0 != strcmp(my_pml, remote_pml))) {
373+
char *errhost = NULL;
374+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_HOSTNAME, &proc_name,
375+
&(errhost), OPAL_STRING);
372376
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
373377
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
374-
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
375-
(NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname,
378+
my_pml, OMPI_NAME_PRINT(&proc_name),
379+
(NULL == errhost) ? "unknown" : errhost,
376380
remote_pml);
377-
free(remote_pml); /* cleanup before returning */
381+
free(remote_pml);
382+
free(errhost);
383+
/* cleanup before returning */
378384
return OMPI_ERR_UNREACH;
379385
}
380386

381387
free(remote_pml);
382388
return OMPI_SUCCESS;
383389
}
390+
391+
int
392+
mca_pml_base_pml_check_selected(const char *my_pml,
393+
ompi_proc_t **procs,
394+
size_t nprocs)
395+
{
396+
int ret = 0;
397+
size_t i;
398+
399+
if (!opal_pmix_collect_all_data) {
400+
/*
401+
* If direct modex, then compare our PML with the peer's PML
402+
* for all procs
403+
*/
404+
for (i = 0; i < nprocs; i++) {
405+
ret = mca_pml_base_pml_check_selected_impl(
406+
my_pml,
407+
procs[i]->super.proc_name);
408+
if (ret) {
409+
return ret;
410+
}
411+
}
412+
} else {
413+
/* otherwise (full modex), compare our PML against rank 0 only */
414+
opal_process_name_t proc_name = {
415+
.jobid = ompi_proc_local()->super.proc_name.jobid,
416+
.vpid = 0
417+
};
418+
ret = mca_pml_base_pml_check_selected_impl(
419+
my_pml,
420+
proc_name);
421+
}
422+
423+
return ret;
424+
}

0 commit comments

Comments
 (0)