Skip to content

Commit c650c92

Browse files
authored
Merge pull request #12838 from Akshay-Venkatesh/topic/5.0.x/reduce-local-impl
5.0.x/ompi/coll/cuda: Implement reduce local
2 parents da2c8fd + e3ad86e commit c650c92

File tree

3 files changed

+65
-0
lines changed

3 files changed

+65
-0
lines changed

ompi/mca/coll/cuda/coll_cuda.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
5454
struct ompi_communicator_t *comm,
5555
mca_coll_base_module_t *module);
5656

57+
/*
 * Local reduction entry point of the cuda coll component (defined in
 * coll_cuda_reduce.c).
 *
 * Applies 'op' to 'count' elements of type 'dtype', combining sbuf into
 * rbuf via ompi_op_reduce().  Any buffer for which
 * mca_coll_cuda_check_buf() reports a positive value is first copied into
 * a malloc'ed staging buffer, and the staged result is copied back into
 * rbuf afterwards.
 *
 * Returns OMPI_SUCCESS on completion, OMPI_ERR_OUT_OF_RESOURCE when a
 * staging buffer cannot be allocated, or the negative value reported by
 * mca_coll_cuda_check_buf().  The 'module' argument is not used by the
 * implementation.
 */
int mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
58+
struct ompi_datatype_t *dtype,
59+
struct ompi_op_t *op,
60+
mca_coll_base_module_t *module);
61+
5762
int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
5863
struct ompi_datatype_t *dtype,
5964
struct ompi_op_t *op,

ompi/mca/coll/cuda/coll_cuda_module.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ static void mca_coll_cuda_module_destruct(mca_coll_cuda_module_t *module)
4040
{
4141
OBJ_RELEASE(module->c_coll.coll_allreduce_module);
4242
OBJ_RELEASE(module->c_coll.coll_reduce_module);
43+
OBJ_RELEASE(module->c_coll.coll_reduce_local_module);
4344
OBJ_RELEASE(module->c_coll.coll_reduce_scatter_block_module);
4445
OBJ_RELEASE(module->c_coll.coll_scatter_module);
4546
/* If the exscan module is not NULL, then this was an
@@ -103,6 +104,7 @@ mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
103104
cuda_module->super.coll_gather = NULL;
104105
cuda_module->super.coll_gatherv = NULL;
105106
cuda_module->super.coll_reduce = mca_coll_cuda_reduce;
107+
cuda_module->super.coll_reduce_local = mca_coll_cuda_reduce_local;
106108
cuda_module->super.coll_reduce_scatter = NULL;
107109
cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
108110
cuda_module->super.coll_scan = mca_coll_cuda_scan;
@@ -135,6 +137,7 @@ int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
135137

136138
CHECK_AND_RETAIN(comm, s, allreduce);
137139
CHECK_AND_RETAIN(comm, s, reduce);
140+
CHECK_AND_RETAIN(comm, s, reduce_local);
138141
CHECK_AND_RETAIN(comm, s, reduce_scatter_block);
139142
CHECK_AND_RETAIN(comm, s, scatter);
140143
if (!OMPI_COMM_IS_INTER(comm)) {

ompi/mca/coll/cuda/coll_cuda_reduce.c

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,60 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
8383
}
8484
return rc;
8585
}
86+
87+
int
88+
mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
89+
struct ompi_datatype_t *dtype,
90+
struct ompi_op_t *op,
91+
mca_coll_base_module_t *module)
92+
{
93+
ptrdiff_t gap;
94+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
95+
size_t bufsize;
96+
int rc;
97+
98+
bufsize = opal_datatype_span(&dtype->super, count, &gap);
99+
100+
rc = mca_coll_cuda_check_buf((void *)sbuf);
101+
if (rc < 0) {
102+
return rc;
103+
}
104+
105+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
106+
sbuf1 = (char*)malloc(bufsize);
107+
if (NULL == sbuf1) {
108+
return OMPI_ERR_OUT_OF_RESOURCE;
109+
}
110+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
111+
sbuf = sbuf1 - gap;
112+
}
113+
114+
rc = mca_coll_cuda_check_buf(rbuf);
115+
if (rc < 0) {
116+
return rc;
117+
}
118+
119+
if (rc > 0) {
120+
rbuf1 = (char*)malloc(bufsize);
121+
if (NULL == rbuf1) {
122+
if (NULL != sbuf1) free(sbuf1);
123+
return OMPI_ERR_OUT_OF_RESOURCE;
124+
}
125+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
126+
rbuf2 = rbuf; /* save away original buffer */
127+
rbuf = rbuf1 - gap;
128+
}
129+
130+
ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
131+
rc = OMPI_SUCCESS;
132+
133+
if (NULL != sbuf1) {
134+
free(sbuf1);
135+
}
136+
if (NULL != rbuf1) {
137+
rbuf = rbuf2;
138+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
139+
free(rbuf1);
140+
}
141+
return rc;
142+
}

0 commit comments

Comments
 (0)