Skip to content

Commit 00de8b7

Browse files
committed
Use the fallback after disqualifying HAN
Instead of calling the communicator collective module after disqualifying HAN call directly the fallback collective. This avoids circular dependencies between modules in some cases. However, while this solution works it deliver suboptimal performance as the withdrawn module will remain in the call chain, but will behave as a passthrough. Signed-off-by: George Bosilca <[email protected]>
1 parent ffab0a4 commit 00de8b7

11 files changed

+71
-76
lines changed

ompi/mca/coll/cuda/coll_cuda_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ mca_coll_cuda_component_t mca_coll_cuda_component = {
6767
/* cuda-specific component information */
6868

6969
/* Priority: make it above all point to point collectives including self */
70-
78,
70+
.priority = 78,
7171
};
7272

7373

ompi/mca/coll/han/coll_han.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,9 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t);
377377
#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \
378378
do { \
379379
if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \
380-
(COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.module_fn.COLL; \
380+
(COMM)->c_coll->coll_ ## COLL = (HANM)->previous_## COLL; \
381381
mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \
382-
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
382+
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->previous_ ## COLL ## _module; \
383383
OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \
384384
OBJ_RELEASE(coll_module); \
385385
} \

ompi/mca/coll/han/coll_han_allgather.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -83,8 +83,8 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
8383
"han cannot handle allgather within this communicator. Fall back on another component\n"));
8484
/* HAN cannot work with this communicator so fallback on all collectives */
8585
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
86-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
87-
comm, comm->c_coll->coll_allgather_module);
86+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
87+
comm, han_module->previous_allgather_module);
8888
}
8989
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
9090
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
@@ -98,8 +98,8 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
9898
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
9999
"han cannot handle allgather with this communicator (imbalance). Fall back on another component\n"));
100100
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
101-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
102-
comm, comm->c_coll->coll_allgather_module);
101+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
102+
comm, han_module->previous_allgather_module);
103103
}
104104

105105
ompi_request_t *temp_request;
@@ -307,8 +307,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
307307
"han cannot handle allgather within this communicator. Fall back on another component\n"));
308308
/* HAN cannot work with this communicator so fallback on all collectives */
309309
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
310-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
311-
comm, comm->c_coll->coll_allgather_module);
310+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
311+
comm, han_module->previous_allgather_module);
312312
}
313313
/* discovery topology */
314314
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
@@ -321,8 +321,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
321321
* future calls will then be automatically redirected.
322322
*/
323323
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
324-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
325-
comm, comm->c_coll->coll_allgather_module);
324+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
325+
comm, han_module->previous_allgather_module);
326326
}
327327

328328
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_allreduce.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -110,8 +110,8 @@ mca_coll_han_allreduce_intra(const void *sbuf,
110110
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
111111
/* HAN cannot work with this communicator so fallback on all collectives */
112112
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
113-
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
114-
comm, comm->c_coll->coll_allreduce_module);
113+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
114+
comm, han_module->previous_allreduce_module);
115115
}
116116

117117
ptrdiff_t extent, lb;
@@ -450,8 +450,8 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf,
450450
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
451451
/* HAN cannot work with this communicator so fallback on all collectives */
452452
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
453-
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
454-
comm, comm->c_coll->coll_allreduce_module);
453+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
454+
comm, han_module->previous_allreduce_module);
455455
}
456456

457457
low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_barrier.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -40,7 +40,7 @@ mca_coll_han_barrier_intra_simple(struct ompi_communicator_t *comm,
4040
* future calls will then be automatically redirected.
4141
*/
4242
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
43-
return comm->c_coll->coll_barrier(comm, comm->c_coll->coll_bcast_module);
43+
return han_module->previous_barrier(comm, han_module->previous_barrier_module);
4444
}
4545

4646
low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_bcast.c

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -63,7 +63,7 @@ mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t *
6363
* iter 4 | | | | lb | task: t1, contains lb
6464
*/
6565
int
66-
mca_coll_han_bcast_intra(void *buff,
66+
mca_coll_han_bcast_intra(void *buf,
6767
int count,
6868
struct ompi_datatype_t *dtype,
6969
int root,
@@ -84,8 +84,8 @@ mca_coll_han_bcast_intra(void *buff,
8484
* future calls will then be automatically redirected.
8585
*/
8686
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
87-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
88-
comm, comm->c_coll->coll_bcast_module);
87+
return han_module->previous_bcast(buf, count, dtype, root,
88+
comm, han_module->previous_bcast_module);
8989
}
9090
/* Topo must be initialized to know rank distribution which then is used to
9191
* determine if han can be used */
@@ -97,8 +97,8 @@ mca_coll_han_bcast_intra(void *buff,
9797
* future calls will then be automatically redirected.
9898
*/
9999
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
100-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
101-
comm, comm->c_coll->coll_bcast_module);
100+
return han_module->previous_bcast(buf, count, dtype, root,
101+
comm, han_module->previous_bcast_module);
102102
}
103103

104104
ompi_datatype_get_extent(dtype, &lb, &extent);
@@ -129,7 +129,7 @@ mca_coll_han_bcast_intra(void *buff,
129129
mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
130130
/* Setup up t0 task arguments */
131131
mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t));
132-
mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype,
132+
mca_coll_han_set_bcast_args(t, t0, (char *)buf, seg_count, dtype,
133133
root_up_rank, root_low_rank, up_comm, low_comm,
134134
num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
135135
low_rank != root_low_rank);
@@ -222,7 +222,7 @@ int mca_coll_han_bcast_t1_task(void *task_args)
222222
* communications without tasks.
223223
*/
224224
int
225-
mca_coll_han_bcast_intra_simple(void *buff,
225+
mca_coll_han_bcast_intra_simple(void *buf,
226226
int count,
227227
struct ompi_datatype_t *dtype,
228228
int root,
@@ -246,8 +246,8 @@ mca_coll_han_bcast_intra_simple(void *buff,
246246
* future calls will then be automatically redirected.
247247
*/
248248
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
249-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
250-
comm, comm->c_coll->coll_bcast_module);
249+
return han_module->previous_bcast(buf, count, dtype, root,
250+
comm, han_module->previous_bcast_module);
251251
}
252252
/* Topo must be initialized to know rank distribution which then is used to
253253
* determine if han can be used */
@@ -259,8 +259,8 @@ mca_coll_han_bcast_intra_simple(void *buff,
259259
* future calls will then be automatically redirected.
260260
*/
261261
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
262-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
263-
comm, comm->c_coll->coll_bcast_module);
262+
return han_module->previous_bcast(buf, count, dtype, root,
263+
comm, han_module->previous_bcast_module);
264264
}
265265

266266
low_comm = han_module->sub_comm[INTRA_NODE];
@@ -277,18 +277,18 @@ mca_coll_han_bcast_intra_simple(void *buff,
277277
w_rank, root_low_rank, root_up_rank));
278278

279279
if (low_rank == root_low_rank) {
280-
up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank,
280+
up_comm->c_coll->coll_bcast(buf, count, dtype, root_up_rank,
281281
up_comm, up_comm->c_coll->coll_bcast_module);
282282

283283
/* To remove when han has better sub-module selection.
284284
For now switching to ibcast enables to make runs with libnbc. */
285285
//ompi_request_t req;
286-
//up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank,
286+
//up_comm->c_coll->coll_ibcast(buf, count, dtype, root_up_rank,
287287
// up_comm, &req, up_comm->c_coll->coll_ibcast_module);
288288
//ompi_request_wait(&req, MPI_STATUS_IGNORE);
289289

290290
}
291-
low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank,
291+
low_comm->c_coll->coll_bcast(buf, count, dtype, root_low_rank,
292292
low_comm, low_comm->c_coll->coll_bcast_module);
293293

294294
return OMPI_SUCCESS;

ompi/mca/coll/han/coll_han_component.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ mca_coll_han_component_t mca_coll_han_component = {
9090
/* han-component specific information */
9191

9292
/* (default) priority */
93-
.han_priority = 20,
93+
.han_priority = 35,
9494
/* workaround for nvcc compiler */
9595
.dynamic_rules_filename = NULL,
9696
};
@@ -251,7 +251,6 @@ static int han_register(void)
251251
TOPO_LVL_T topo_lvl;
252252
COMPONENT_T component;
253253

254-
cs->han_priority = 35;
255254
(void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component",
256255
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
257256
OPAL_INFO_LVL_9,

ompi/mca/coll/han/coll_han_dynamic.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -615,8 +615,8 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf,
615615
int rank, verbosity = 0;
616616

617617
if (!han_module->enabled) {
618-
return han_module->fallback.allreduce.module_fn.allreduce(sbuf, rbuf, count, dtype, op, comm,
619-
han_module->fallback.allreduce.module);
618+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm,
619+
han_module->previous_allreduce_module);
620620
}
621621

622622
/* Compute configuration information for dynamic rules */
@@ -728,7 +728,7 @@ mca_coll_han_barrier_intra_dynamic(struct ompi_communicator_t *comm,
728728
int rank, verbosity = 0;
729729

730730
if (!han_module->enabled) {
731-
return han_module->fallback.barrier.module_fn.barrier(comm, han_module->fallback.barrier.module);
731+
return han_module->previous_barrier(comm, han_module->previous_barrier_module);
732732
}
733733

734734
/* Compute configuration information for dynamic rules */
@@ -830,8 +830,8 @@ mca_coll_han_bcast_intra_dynamic(void *buff,
830830
int rank, verbosity = 0;
831831

832832
if (!han_module->enabled) {
833-
return han_module->fallback.bcast.module_fn.bcast(buff, count, dtype, root, comm,
834-
han_module->fallback.bcast.module);
833+
return han_module->previous_bcast(buff, count, dtype, root, comm,
834+
han_module->previous_bcast_module);
835835
}
836836

837837
/* Compute configuration information for dynamic rules */
@@ -946,8 +946,8 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount,
946946
int rank, verbosity = 0;
947947

948948
if (!han_module->enabled) {
949-
return han_module->fallback.gather.module_fn.gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
950-
han_module->fallback.gather.module);
949+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
950+
han_module->previous_gather_module);
951951
}
952952

953953
/* Compute configuration information for dynamic rules */
@@ -1070,8 +1070,8 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf,
10701070
int rank, verbosity = 0;
10711071

10721072
if (!han_module->enabled) {
1073-
return han_module->fallback.reduce.module_fn.reduce(sbuf, rbuf, count, dtype, op, root, comm,
1074-
han_module->fallback.reduce.module);
1073+
return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, comm,
1074+
han_module->previous_reduce_module);
10751075
}
10761076

10771077
/* Compute configuration information for dynamic rules */
@@ -1191,8 +1191,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount,
11911191
int rank, verbosity = 0;
11921192

11931193
if (!han_module->enabled) {
1194-
return han_module->fallback.scatter.module_fn.scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
1195-
han_module->fallback.scatter.module);
1194+
return han_module->previous_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
1195+
han_module->previous_scatter_module);
11961196
}
11971197

11981198
/* Compute configuration information for dynamic rules */

ompi/mca/coll/han/coll_han_gather.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -93,9 +93,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
9393
"han cannot handle gather with this communicator. Fall back on another component\n"));
9494
/* HAN cannot work with this communicator so fallback on all collectives */
9595
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
96-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
97-
rcount, rdtype, root,
98-
comm, comm->c_coll->coll_gather_module);
96+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
97+
comm, han_module->previous_gather_module);
9998
}
10099

101100
/* Topo must be initialized to know rank distribution which then is used to
@@ -108,9 +107,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
108107
* future calls will then be automatically redirected.
109108
*/
110109
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
111-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
112-
rcount, rdtype, root,
113-
comm, comm->c_coll->coll_gather_module);
110+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
111+
comm, han_module->previous_gather_module);
114112
}
115113

116114
w_rank = ompi_comm_rank(comm);
@@ -359,9 +357,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
359357
"han cannot handle gather with this communicator. Fall back on another component\n"));
360358
/* HAN cannot work with this communicator so fallback on all collectives */
361359
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
362-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
363-
rcount, rdtype, root,
364-
comm, comm->c_coll->coll_gather_module);
360+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
361+
comm, han_module->previous_gather_module);
365362
}
366363

367364
/* Topo must be initialized to know rank distribution which then is used to
@@ -374,9 +371,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
374371
* future calls will then be automatically redirected.
375372
*/
376373
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
377-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
378-
rcount, rdtype, root,
379-
comm, comm->c_coll->coll_gather_module);
374+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
375+
comm, han_module->previous_gather_module);
380376
}
381377

382378
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];

0 commit comments

Comments
 (0)