Skip to content

Commit acbe7b0

Browse files
authored
Merge pull request #9942 from jjhursey/big-payload-inter-coll
Fix intercommunicator overflow with big payload collectives
2 parents: 2d2d0a7 + fe07940; commit acbe7b0

6 files changed: 42 additions and 16 deletions

ompi/mca/coll/inter/coll_inter_allgather.c

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2017 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -48,9 +49,10 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
4849
struct ompi_communicator_t *comm,
4950
mca_coll_base_module_t *module)
5051
{
51-
int rank, root = 0, size, rsize, err = OMPI_SUCCESS;
52+
int rank, root = 0, size, rsize, err = OMPI_SUCCESS, i;
5253
char *ptmp_free = NULL, *ptmp = NULL;
5354
ptrdiff_t gap, span;
55+
void *rbuf_ptr;
5456

5557
rank = ompi_comm_rank(comm);
5658
size = ompi_comm_size(comm->c_local_comm);
@@ -76,9 +78,9 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
7678

7779
if (rank == root) {
7880
/* Do a send-recv between the two root procs. to avoid deadlock */
79-
err = ompi_coll_base_sendrecv_actual(ptmp, scount*size, sdtype, 0,
81+
err = ompi_coll_base_sendrecv_actual(ptmp, scount*(size_t)size, sdtype, 0,
8082
MCA_COLL_BASE_TAG_ALLGATHER,
81-
rbuf, rcount*rsize, rdtype, 0,
83+
rbuf, rcount*(size_t)rsize, rdtype, 0,
8284
MCA_COLL_BASE_TAG_ALLGATHER,
8385
comm, MPI_STATUS_IGNORE);
8486
if (OMPI_SUCCESS != err) {
@@ -87,12 +89,28 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
8789
}
8890
/* bcast the message to all the local processes */
8991
if ( rcount > 0 ) {
90-
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
91-
root, comm->c_local_comm,
92-
comm->c_local_comm->c_coll->coll_bcast_module);
93-
if (OMPI_SUCCESS != err) {
94-
goto exit;
95-
}
92+
if ( OPAL_UNLIKELY(rcount*(size_t)rsize > INT_MAX) ) {
93+
// Sending the message in the coll_bcast as "rcount*rsize" would exceed
94+
// the 'int count' parameter in the coll_bcast() function. Instead broadcast
95+
// the result in "rcount" chunks to the local group.
96+
span = opal_datatype_span(&rdtype->super, rcount, &gap);
97+
for( i = 0; i < rsize; ++i) {
98+
rbuf_ptr = (char*)rbuf + span * (size_t)i;
99+
err = comm->c_local_comm->c_coll->coll_bcast(rbuf_ptr, rcount, rdtype,
100+
root, comm->c_local_comm,
101+
comm->c_local_comm->c_coll->coll_bcast_module);
102+
if (OMPI_SUCCESS != err) {
103+
goto exit;
104+
}
105+
}
106+
} else {
107+
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
108+
root, comm->c_local_comm,
109+
comm->c_local_comm->c_coll->coll_bcast_module);
110+
if (OMPI_SUCCESS != err) {
111+
goto exit;
112+
}
113+
}
96114
}
97115

98116
exit:

ompi/mca/coll/inter/coll_inter_allgatherv.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2017 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -47,7 +48,8 @@ mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
4748
struct ompi_communicator_t *comm,
4849
mca_coll_base_module_t *module)
4950
{
50-
int i, rank, size, size_local, total=0, err;
51+
int i, rank, size, size_local, err;
52+
size_t total = 0;
5153
int *count=NULL,*displace=NULL;
5254
char *ptmp_free=NULL, *ptmp=NULL;
5355
ompi_datatype_t *ndtype = NULL;

ompi/mca/coll/inter/coll_inter_gather.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2007 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2016 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -76,7 +77,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
7677
comm->c_local_comm->c_coll->coll_gather_module);
7778
if (0 == rank) {
7879
/* First process sends data to the root */
79-
err = MCA_PML_CALL(send(ptmp, scount*size_local, sdtype, root,
80+
err = MCA_PML_CALL(send(ptmp, scount*(size_t)size_local, sdtype, root,
8081
MCA_COLL_BASE_TAG_GATHER,
8182
MCA_PML_BASE_SEND_STANDARD, comm));
8283
if (OMPI_SUCCESS != err) {
@@ -86,7 +87,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
8687
free(ptmp_free);
8788
} else {
8889
/* I am the root, loop receiving the data. */
89-
err = MCA_PML_CALL(recv(rbuf, rcount*size, rdtype, 0,
90+
err = MCA_PML_CALL(recv(rbuf, rcount*(size_t)size, rdtype, 0,
9091
MCA_COLL_BASE_TAG_GATHER,
9192
comm, MPI_STATUS_IGNORE));
9293
if (OMPI_SUCCESS != err) {

ompi/mca/coll/inter/coll_inter_gatherv.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2016 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -44,7 +45,8 @@ mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
4445
struct ompi_communicator_t *comm,
4546
mca_coll_base_module_t *module)
4647
{
47-
int i, rank, size, size_local, total=0, err;
48+
int i, rank, size, size_local, err;
49+
size_t total = 0;
4850
int *count=NULL, *displace=NULL;
4951
char *ptmp_free=NULL, *ptmp=NULL;
5052
ompi_datatype_t *ndtype;

ompi/mca/coll/inter/coll_inter_scatter.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2016 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -69,7 +70,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
6970
}
7071
ptmp = ptmp_free - gap;
7172

72-
err = MCA_PML_CALL(recv(ptmp, rcount*size_local, rdtype,
73+
err = MCA_PML_CALL(recv(ptmp, rcount*(size_t)size_local, rdtype,
7374
root, MCA_COLL_BASE_TAG_SCATTER,
7475
comm, MPI_STATUS_IGNORE));
7576
if (OMPI_SUCCESS != err) {
@@ -86,7 +87,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
8687
}
8788
} else {
8889
/* Root sends data to the first process in the remote group */
89-
err = MCA_PML_CALL(send(sbuf, scount*size, sdtype, 0,
90+
err = MCA_PML_CALL(send(sbuf, scount*(size_t)size, sdtype, 0,
9091
MCA_COLL_BASE_TAG_SCATTER,
9192
MCA_PML_BASE_SEND_STANDARD, comm));
9293
if (OMPI_SUCCESS != err) {

ompi/mca/coll/inter/coll_inter_scatterv.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2016 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -45,7 +46,8 @@ mca_coll_inter_scatterv_inter(const void *sbuf, const int *scounts,
4546
struct ompi_communicator_t *comm,
4647
mca_coll_base_module_t *module)
4748
{
48-
int i, rank, size, err, total=0, size_local;
49+
int i, rank, size, err, size_local;
50+
size_t total = 0;
4951
int *counts=NULL,*displace=NULL;
5052
char *ptmp_free=NULL, *ptmp=NULL;
5153
ompi_datatype_t *ndtype;

0 commit comments

Comments (0)