Skip to content

Commit e25e897

Browse files
authored
Merge pull request #12758 from Akshay-Venkatesh/topic/main/reduce-local-impl
ompi/coll/accelerator: implement reduce_local
2 parents 1afb524 + 7f6f788 commit e25e897

File tree

3 files changed

+68
-0
lines changed

3 files changed

+68
-0
lines changed

ompi/mca/coll/accelerator/coll_accelerator.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23
* Copyright (c) 2014 The University of Tennessee and The University
34
* of Tennessee Research Foundation. All rights
45
* reserved.
@@ -45,6 +46,11 @@ mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, size_t count,
4546
struct ompi_communicator_t *comm,
4647
mca_coll_base_module_t *module);
4748

49+
/*
 * Local reduction entry point of the coll/accelerator component
 * (installed as the module's coll_reduce_local handler).
 *
 * Applies `op` to `count` elements of type `dtype`, combining `sbuf`
 * into `rbuf`.  Returns OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE
 * if a temporary staging buffer cannot be allocated, or the negative
 * code reported by the buffer check.  `module` is not used by the
 * implementation.
 */
int mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count,
50+
struct ompi_datatype_t *dtype,
51+
struct ompi_op_t *op,
52+
mca_coll_base_module_t *module);
53+
4854
int mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count,
4955
struct ompi_datatype_t *dtype,
5056
struct ompi_op_t *op,

ompi/mca/coll/accelerator/coll_accelerator_module.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23
* Copyright (c) 2014-2017 The University of Tennessee and The University
34
* of Tennessee Research Foundation. All rights
45
* reserved.
@@ -94,6 +95,7 @@ mca_coll_accelerator_comm_query(struct ompi_communicator_t *comm,
9495

9596
accelerator_module->super.coll_allreduce = mca_coll_accelerator_allreduce;
9697
accelerator_module->super.coll_reduce = mca_coll_accelerator_reduce;
98+
accelerator_module->super.coll_reduce_local = mca_coll_accelerator_reduce_local;
9799
accelerator_module->super.coll_reduce_scatter_block = mca_coll_accelerator_reduce_scatter_block;
98100
if (!OMPI_COMM_IS_INTER(comm)) {
99101
accelerator_module->super.coll_scan = mca_coll_accelerator_scan;
@@ -141,6 +143,7 @@ mca_coll_accelerator_module_enable(mca_coll_base_module_t *module,
141143

142144
ACCELERATOR_INSTALL_COLL_API(comm, s, allreduce);
143145
ACCELERATOR_INSTALL_COLL_API(comm, s, reduce);
146+
ACCELERATOR_INSTALL_COLL_API(comm, s, reduce_local);
144147
ACCELERATOR_INSTALL_COLL_API(comm, s, reduce_scatter_block);
145148
if (!OMPI_COMM_IS_INTER(comm)) {
146149
/* MPI does not define scan/exscan on intercommunicators */
@@ -159,6 +162,7 @@ mca_coll_accelerator_module_disable(mca_coll_base_module_t *module,
159162

160163
ACCELERATOR_UNINSTALL_COLL_API(comm, s, allreduce);
161164
ACCELERATOR_UNINSTALL_COLL_API(comm, s, reduce);
165+
ACCELERATOR_UNINSTALL_COLL_API(comm, s, reduce_local);
162166
ACCELERATOR_UNINSTALL_COLL_API(comm, s, reduce_scatter_block);
163167
if (!OMPI_COMM_IS_INTER(comm))
164168
{

ompi/mca/coll/accelerator/coll_accelerator_reduce.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23
* Copyright (c) 2004-2023 The University of Tennessee and The University
34
* of Tennessee Research Foundation. All rights
45
* reserved.
@@ -84,3 +85,60 @@ mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count,
8485
}
8586
return rc;
8687
}
88+
89+
int
90+
mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count,
91+
struct ompi_datatype_t *dtype,
92+
struct ompi_op_t *op,
93+
mca_coll_base_module_t *module)
94+
{
95+
ptrdiff_t gap;
96+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
97+
size_t bufsize;
98+
int rc;
99+
100+
bufsize = opal_datatype_span(&dtype->super, count, &gap);
101+
102+
rc = mca_coll_accelerator_check_buf((void *)sbuf);
103+
if (rc < 0) {
104+
return rc;
105+
}
106+
107+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
108+
sbuf1 = (char*)malloc(bufsize);
109+
if (NULL == sbuf1) {
110+
return OMPI_ERR_OUT_OF_RESOURCE;
111+
}
112+
mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize);
113+
sbuf = sbuf1 - gap;
114+
}
115+
116+
rc = mca_coll_accelerator_check_buf(rbuf);
117+
if (rc < 0) {
118+
return rc;
119+
}
120+
121+
if (rc > 0) {
122+
rbuf1 = (char*)malloc(bufsize);
123+
if (NULL == rbuf1) {
124+
if (NULL != sbuf1) free(sbuf1);
125+
return OMPI_ERR_OUT_OF_RESOURCE;
126+
}
127+
mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize);
128+
rbuf2 = rbuf; /* save away original buffer */
129+
rbuf = rbuf1 - gap;
130+
}
131+
132+
ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
133+
rc = OMPI_SUCCESS;
134+
135+
if (NULL != sbuf1) {
136+
free(sbuf1);
137+
}
138+
if (NULL != rbuf1) {
139+
rbuf = rbuf2;
140+
mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize);
141+
free(rbuf1);
142+
}
143+
return rc;
144+
}

0 commit comments

Comments
 (0)