Skip to content

Commit e0bf553

Browse files
authored
Merge pull request #11454 from wzamazon/coll_han_fix_infinite_loop
coll/han: call fallback functin when HAN module is disabled
2 parents cc9883d + 00de8b7 commit e0bf553

11 files changed

+88
-65
lines changed

ompi/mca/coll/cuda/coll_cuda_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ mca_coll_cuda_component_t mca_coll_cuda_component = {
6767
/* cuda-specific component information */
6868

6969
/* Priority: make it above all point to point collectives including self */
70-
78,
70+
.priority = 78,
7171
};
7272

7373

ompi/mca/coll/han/coll_han.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,9 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t);
377377
#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \
378378
do { \
379379
if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \
380-
(COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.module_fn.COLL; \
380+
(COMM)->c_coll->coll_ ## COLL = (HANM)->previous_## COLL; \
381381
mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \
382-
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
382+
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->previous_ ## COLL ## _module; \
383383
OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \
384384
OBJ_RELEASE(coll_module); \
385385
} \

ompi/mca/coll/han/coll_han_allgather.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -83,8 +83,8 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
8383
"han cannot handle allgather within this communicator. Fall back on another component\n"));
8484
/* HAN cannot work with this communicator so fallback on all collectives */
8585
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
86-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
87-
comm, comm->c_coll->coll_allgather_module);
86+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
87+
comm, han_module->previous_allgather_module);
8888
}
8989
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
9090
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
@@ -98,8 +98,8 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
9898
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
9999
"han cannot handle allgather with this communicator (imbalance). Fall back on another component\n"));
100100
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
101-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
102-
comm, comm->c_coll->coll_allgather_module);
101+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
102+
comm, han_module->previous_allgather_module);
103103
}
104104

105105
ompi_request_t *temp_request;
@@ -307,8 +307,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
307307
"han cannot handle allgather within this communicator. Fall back on another component\n"));
308308
/* HAN cannot work with this communicator so fallback on all collectives */
309309
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
310-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
311-
comm, comm->c_coll->coll_allgather_module);
310+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
311+
comm, han_module->previous_allgather_module);
312312
}
313313
/* discovery topology */
314314
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
@@ -321,8 +321,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
321321
* future calls will then be automatically redirected.
322322
*/
323323
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
324-
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
325-
comm, comm->c_coll->coll_allgather_module);
324+
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
325+
comm, han_module->previous_allgather_module);
326326
}
327327

328328
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_allreduce.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -110,8 +110,8 @@ mca_coll_han_allreduce_intra(const void *sbuf,
110110
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
111111
/* HAN cannot work with this communicator so fallback on all collectives */
112112
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
113-
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
114-
comm, comm->c_coll->coll_allreduce_module);
113+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
114+
comm, han_module->previous_allreduce_module);
115115
}
116116

117117
ptrdiff_t extent, lb;
@@ -450,8 +450,8 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf,
450450
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
451451
/* HAN cannot work with this communicator so fallback on all collectives */
452452
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
453-
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
454-
comm, comm->c_coll->coll_allreduce_module);
453+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
454+
comm, han_module->previous_allreduce_module);
455455
}
456456

457457
low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_barrier.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -40,7 +40,7 @@ mca_coll_han_barrier_intra_simple(struct ompi_communicator_t *comm,
4040
* future calls will then be automatically redirected.
4141
*/
4242
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
43-
return comm->c_coll->coll_barrier(comm, comm->c_coll->coll_bcast_module);
43+
return han_module->previous_barrier(comm, han_module->previous_barrier_module);
4444
}
4545

4646
low_comm = han_module->sub_comm[INTRA_NODE];

ompi/mca/coll/han/coll_han_bcast.c

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -63,7 +63,7 @@ mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t *
6363
* iter 4 | | | | lb | task: t1, contains lb
6464
*/
6565
int
66-
mca_coll_han_bcast_intra(void *buff,
66+
mca_coll_han_bcast_intra(void *buf,
6767
int count,
6868
struct ompi_datatype_t *dtype,
6969
int root,
@@ -84,8 +84,8 @@ mca_coll_han_bcast_intra(void *buff,
8484
* future calls will then be automatically redirected.
8585
*/
8686
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
87-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
88-
comm, comm->c_coll->coll_bcast_module);
87+
return han_module->previous_bcast(buf, count, dtype, root,
88+
comm, han_module->previous_bcast_module);
8989
}
9090
/* Topo must be initialized to know rank distribution which then is used to
9191
* determine if han can be used */
@@ -97,8 +97,8 @@ mca_coll_han_bcast_intra(void *buff,
9797
* future calls will then be automatically redirected.
9898
*/
9999
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
100-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
101-
comm, comm->c_coll->coll_bcast_module);
100+
return han_module->previous_bcast(buf, count, dtype, root,
101+
comm, han_module->previous_bcast_module);
102102
}
103103

104104
ompi_datatype_get_extent(dtype, &lb, &extent);
@@ -129,7 +129,7 @@ mca_coll_han_bcast_intra(void *buff,
129129
mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
130130
/* Setup up t0 task arguments */
131131
mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t));
132-
mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype,
132+
mca_coll_han_set_bcast_args(t, t0, (char *)buf, seg_count, dtype,
133133
root_up_rank, root_low_rank, up_comm, low_comm,
134134
num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
135135
low_rank != root_low_rank);
@@ -222,7 +222,7 @@ int mca_coll_han_bcast_t1_task(void *task_args)
222222
* communications without tasks.
223223
*/
224224
int
225-
mca_coll_han_bcast_intra_simple(void *buff,
225+
mca_coll_han_bcast_intra_simple(void *buf,
226226
int count,
227227
struct ompi_datatype_t *dtype,
228228
int root,
@@ -246,8 +246,8 @@ mca_coll_han_bcast_intra_simple(void *buff,
246246
* future calls will then be automatically redirected.
247247
*/
248248
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
249-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
250-
comm, comm->c_coll->coll_bcast_module);
249+
return han_module->previous_bcast(buf, count, dtype, root,
250+
comm, han_module->previous_bcast_module);
251251
}
252252
/* Topo must be initialized to know rank distribution which then is used to
253253
* determine if han can be used */
@@ -259,8 +259,8 @@ mca_coll_han_bcast_intra_simple(void *buff,
259259
* future calls will then be automatically redirected.
260260
*/
261261
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
262-
return comm->c_coll->coll_bcast(buff, count, dtype, root,
263-
comm, comm->c_coll->coll_bcast_module);
262+
return han_module->previous_bcast(buf, count, dtype, root,
263+
comm, han_module->previous_bcast_module);
264264
}
265265

266266
low_comm = han_module->sub_comm[INTRA_NODE];
@@ -277,18 +277,18 @@ mca_coll_han_bcast_intra_simple(void *buff,
277277
w_rank, root_low_rank, root_up_rank));
278278

279279
if (low_rank == root_low_rank) {
280-
up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank,
280+
up_comm->c_coll->coll_bcast(buf, count, dtype, root_up_rank,
281281
up_comm, up_comm->c_coll->coll_bcast_module);
282282

283283
/* To remove when han has better sub-module selection.
284284
For now switching to ibcast enables to make runs with libnbc. */
285285
//ompi_request_t req;
286-
//up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank,
286+
//up_comm->c_coll->coll_ibcast(buf, count, dtype, root_up_rank,
287287
// up_comm, &req, up_comm->c_coll->coll_ibcast_module);
288288
//ompi_request_wait(&req, MPI_STATUS_IGNORE);
289289

290290
}
291-
low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank,
291+
low_comm->c_coll->coll_bcast(buf, count, dtype, root_low_rank,
292292
low_comm, low_comm->c_coll->coll_bcast_module);
293293

294294
return OMPI_SUCCESS;

ompi/mca/coll/han/coll_han_component.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ mca_coll_han_component_t mca_coll_han_component = {
9090
/* han-component specific information */
9191

9292
/* (default) priority */
93-
.han_priority = 20,
93+
.han_priority = 35,
9494
/* workaround for nvcc compiler */
9595
.dynamic_rules_filename = NULL,
9696
};
@@ -251,7 +251,6 @@ static int han_register(void)
251251
TOPO_LVL_T topo_lvl;
252252
COMPONENT_T component;
253253

254-
cs->han_priority = 35;
255254
(void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component",
256255
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
257256
OPAL_INFO_LVL_9,

ompi/mca/coll/han/coll_han_dynamic.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,11 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf,
614614
size_t dtype_size;
615615
int rank, verbosity = 0;
616616

617+
if (!han_module->enabled) {
618+
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm,
619+
han_module->previous_allreduce_module);
620+
}
621+
617622
/* Compute configuration information for dynamic rules */
618623
ompi_datatype_type_size(dtype, &dtype_size);
619624
dtype_size = dtype_size * count;
@@ -722,6 +727,9 @@ mca_coll_han_barrier_intra_dynamic(struct ompi_communicator_t *comm,
722727
mca_coll_base_module_t *sub_module;
723728
int rank, verbosity = 0;
724729

730+
if (!han_module->enabled) {
731+
return han_module->previous_barrier(comm, han_module->previous_barrier_module);
732+
}
725733

726734
/* Compute configuration information for dynamic rules */
727735
sub_module = get_module(BARRIER,
@@ -821,6 +829,11 @@ mca_coll_han_bcast_intra_dynamic(void *buff,
821829
size_t dtype_size;
822830
int rank, verbosity = 0;
823831

832+
if (!han_module->enabled) {
833+
return han_module->previous_bcast(buff, count, dtype, root, comm,
834+
han_module->previous_bcast_module);
835+
}
836+
824837
/* Compute configuration information for dynamic rules */
825838
ompi_datatype_type_size(dtype, &dtype_size);
826839
dtype_size = dtype_size * count;
@@ -932,6 +945,11 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount,
932945
size_t dtype_size;
933946
int rank, verbosity = 0;
934947

948+
if (!han_module->enabled) {
949+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
950+
han_module->previous_gather_module);
951+
}
952+
935953
/* Compute configuration information for dynamic rules */
936954
if( MPI_IN_PLACE != sbuf ) {
937955
ompi_datatype_type_size(sdtype, &dtype_size);
@@ -1051,6 +1069,11 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf,
10511069
size_t dtype_size;
10521070
int rank, verbosity = 0;
10531071

1072+
if (!han_module->enabled) {
1073+
return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, comm,
1074+
han_module->previous_reduce_module);
1075+
}
1076+
10541077
/* Compute configuration information for dynamic rules */
10551078
ompi_datatype_type_size(dtype, &dtype_size);
10561079
dtype_size = dtype_size * count;
@@ -1167,6 +1190,11 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount,
11671190
size_t dtype_size;
11681191
int rank, verbosity = 0;
11691192

1193+
if (!han_module->enabled) {
1194+
return han_module->previous_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm,
1195+
han_module->previous_scatter_module);
1196+
}
1197+
11701198
/* Compute configuration information for dynamic rules */
11711199
if( MPI_IN_PLACE != rbuf ) {
11721200
ompi_datatype_type_size(rdtype, &dtype_size);

ompi/mca/coll/han/coll_han_gather.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2020 The University of Tennessee and The University
2+
* Copyright (c) 2018-2023 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
@@ -93,9 +93,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
9393
"han cannot handle gather with this communicator. Fall back on another component\n"));
9494
/* HAN cannot work with this communicator so fallback on all collectives */
9595
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
96-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
97-
rcount, rdtype, root,
98-
comm, comm->c_coll->coll_gather_module);
96+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
97+
comm, han_module->previous_gather_module);
9998
}
10099

101100
/* Topo must be initialized to know rank distribution which then is used to
@@ -108,9 +107,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
108107
* future calls will then be automatically redirected.
109108
*/
110109
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
111-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
112-
rcount, rdtype, root,
113-
comm, comm->c_coll->coll_gather_module);
110+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
111+
comm, han_module->previous_gather_module);
114112
}
115113

116114
w_rank = ompi_comm_rank(comm);
@@ -359,9 +357,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
359357
"han cannot handle gather with this communicator. Fall back on another component\n"));
360358
/* HAN cannot work with this communicator so fallback on all collectives */
361359
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
362-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
363-
rcount, rdtype, root,
364-
comm, comm->c_coll->coll_gather_module);
360+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
361+
comm, han_module->previous_gather_module);
365362
}
366363

367364
/* Topo must be initialized to know rank distribution which then is used to
@@ -374,9 +371,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
374371
* future calls will then be automatically redirected.
375372
*/
376373
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
377-
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
378-
rcount, rdtype, root,
379-
comm, comm->c_coll->coll_gather_module);
374+
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
375+
comm, han_module->previous_gather_module);
380376
}
381377

382378
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];

0 commit comments

Comments
 (0)