Skip to content

Commit cd8d696

Browse files
committed
coll/libnbc: checkpoint
Fix MPI_IN_PLACE handling and temporary buffer allocation/usage. Thanks to Yuki Matsumoto for the report.
1 parent 227d15a commit cd8d696

File tree

2 files changed

+40
-19
lines changed

2 files changed

+40
-19
lines changed

ompi/mca/coll/libnbc/nbc_iallreduce.c

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
#include <assert.h>
2222

2323
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
24-
void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle);
24+
void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle);
2525
static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
2626
void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
2727
NBC_Handle *handle);
@@ -63,6 +63,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
6363
char inplace;
6464
NBC_Handle *handle;
6565
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
66+
ptrdiff_t span, gap;
6667

6768
NBC_IN_PLACE(sendbuf, recvbuf, inplace);
6869

@@ -86,7 +87,8 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
8687
return res;
8788
}
8889

89-
handle->tmpbuf = malloc (ext * count);
90+
span = opal_datatype_span(&datatype->super, count, &gap);
91+
handle->tmpbuf = malloc (span);
9092
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
9193
NBC_Return_handle (handle);
9294
return OMPI_ERR_OUT_OF_RESOURCE;
@@ -129,7 +131,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
129131

130132
switch(alg) {
131133
case NBC_ARED_BINOMIAL:
132-
res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle);
134+
res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, inplace, schedule, handle);
133135
break;
134136
case NBC_ARED_RING:
135137
res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle);
@@ -298,25 +300,33 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
298300
if (vrank == root) rank = 0; \
299301
}
300302
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf,
301-
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) {
303+
MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) {
302304
int root, vrank, maxr, vpeer, peer, res;
303305
char *rbuf, *lbuf, *buf;
304306
int tmprbuf, tmplbuf;
307+
ptrdiff_t gap;
308+
(void)opal_datatype_span(&datatype->super, count, &gap);
305309

306310
root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */
307311
RANK2VRANK(rank, vrank, root);
308312
maxr = (int)ceil((log((double)p)/LOG2));
309313
/* ensure the result ends up in recvbuf on vrank 0 */
310314
if (0 == (maxr%2)) {
311-
rbuf = 0;
315+
rbuf = (void *)(-gap);
312316
tmprbuf = true;
313317
lbuf = recvbuf;
314318
tmplbuf = false;
315319
} else {
316-
lbuf = 0;
320+
lbuf = (void *)(-gap);
317321
tmplbuf = true;
318322
rbuf = recvbuf;
319323
tmprbuf = false;
324+
if (inplace) {
325+
res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf) - gap, count, datatype, MPI_COMM_SELF);
326+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
327+
return res;
328+
}
329+
}
320330
}
321331

322332
for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
@@ -332,7 +342,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
332342
}
333343

334344
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
335-
if (firstred && MPI_IN_PLACE != sendbuf) {
345+
if (firstred && !inplace) {
336346
/* perform the reduce with the sendbuf */
337347
res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
338348
firstred = 0;
@@ -351,7 +361,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
351361
/* we have to send this round */
352362
vpeer = vrank - (1 << (r - 1));
353363
VRANK2RANK(peer, vpeer, root)
354-
if (firstred && MPI_IN_PLACE != sendbuf) {
364+
if (firstred && !inplace) {
355365
/* we have to use the sendbuf in the first round .. */
356366
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
357367
} else {

ompi/mca/coll/libnbc/nbc_ireduce.c

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include "nbc_internal.h"
2020

2121
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype,
22-
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle);
22+
MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle);
2323
static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
2424
MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize);
2525

@@ -58,6 +58,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
5858
enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg;
5959
NBC_Handle *handle;
6060
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
61+
ptrdiff_t span, gap;
6162

6263
NBC_IN_PLACE(sendbuf, recvbuf, inplace);
6364

@@ -92,20 +93,22 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
9293
return res;
9394
}
9495

96+
span = opal_datatype_span(&datatype->super, count, &gap);
97+
9598
/* algorithm selection */
9699
if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) {
97100
alg = NBC_RED_BINOMIAL;
98101
if(rank == root) {
99102
/* root reduces in receivebuffer */
100-
handle->tmpbuf = malloc (ext * count);
103+
handle->tmpbuf = malloc (span);
101104
redbuf = recvbuf;
102105
} else {
103106
/* recvbuf may not be valid on non-root nodes */
104-
handle->tmpbuf = malloc (ext * count * 2);
105-
redbuf = (char*) handle->tmpbuf + ext * count;
107+
handle->tmpbuf = malloc (2*span);
108+
redbuf = (char*) handle->tmpbuf + span - gap;
106109
}
107110
} else {
108-
handle->tmpbuf = malloc (ext * count);
111+
handle->tmpbuf = malloc (span);
109112
alg = NBC_RED_CHAIN;
110113
segsize = 16384/2;
111114
}
@@ -139,7 +142,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
139142

140143
switch(alg) {
141144
case NBC_RED_BINOMIAL:
142-
res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, schedule, handle);
145+
res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, inplace, schedule, handle);
143146
break;
144147
case NBC_RED_CHAIN:
145148
res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize);
@@ -292,10 +295,12 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count
292295
if (vrank == root) rank = 0; \
293296
}
294297
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype,
295-
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) {
298+
MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) {
296299
int vroot, vrank, vpeer, peer, res, maxr;
297300
char *rbuf, *lbuf, *buf;
298301
int tmprbuf, tmplbuf;
302+
ptrdiff_t gap;
303+
(void)opal_datatype_span(&datatype->super, count, &gap);
299304

300305
if (ompi_op_is_commute(op)) {
301306
vroot = root;
@@ -307,15 +312,21 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen
307312

308313
/* ensure the result ends up in redbuf on vrank 0 */
309314
if (0 == (maxr%2)) {
310-
rbuf = 0;
315+
rbuf = (void *)(-gap);
311316
tmprbuf = true;
312317
lbuf = redbuf;
313318
tmplbuf = false;
314319
} else {
315-
lbuf = 0;
320+
lbuf = (void *)(-gap);
316321
tmplbuf = true;
317322
rbuf = redbuf;
318323
tmprbuf = false;
324+
if (inplace) {
325+
res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf)-gap, count, datatype, MPI_COMM_SELF);
326+
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
327+
return res;
328+
}
329+
}
319330
}
320331

321332
for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
@@ -332,7 +343,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen
332343

333344
/* perform the reduce in my local buffer */
334345
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
335-
if (firstred && MPI_IN_PLACE != sendbuf) {
346+
if (firstred && !inplace) {
336347
/* perform the reduce with the sendbuf */
337348
res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
338349
firstred = 0;
@@ -352,7 +363,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen
352363
/* we have to send this round */
353364
vpeer = vrank - (1 << (r - 1));
354365
VRANK2RANK(peer, vpeer, vroot)
355-
if (firstred && MPI_IN_PLACE != sendbuf) {
366+
if (firstred && !inplace) {
356367
/* we have to use the sendbuf in the first round .. */
357368
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
358369
} else {

0 commit comments

Comments
 (0)