From a59b7e602d4d54c2fffeaf3169e430caa2ff4bc2 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 11:07:26 +0900 Subject: [PATCH 01/14] coll/libnbc: fix allreduce allred_sched_diss algo allred_sched_diss was broken : - use ompi_op_reduce() for Fortran/C++ support - use ompi_op_reduce() keeping in mind op might not be commutative - fix buffer overwrite caused by incorrect usage of ompi_3buff_op_user() Thanks Yuki Matsumoto for the report --- ompi/mca/coll/libnbc/nbc.c | 47 ++++++++++++++++++++++++++- ompi/mca/coll/libnbc/nbc_iallreduce.c | 31 ++++++++++++++---- ompi/mca/coll/libnbc/nbc_internal.h | 7 +++- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index ef9ace0ec4a..81e59a80cc8 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -10,7 +10,7 @@ * rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* * Author(s): Torsten Hoefler @@ -158,6 +158,33 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in return OMPI_SUCCESS; } +/* this function puts an operation into the schedule */ +int NBC_Sched_op2 (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, bool barrier) { + NBC_Args_op op_args; + int ret; + + /* store the passed arguments */ + op_args.type = OP2; + op_args.buf1 = buf1; + op_args.buf2 = buf2; + op_args.tmpbuf1 = tmpbuf1; + op_args.tmpbuf2 = tmpbuf2; + op_args.count = count; + op_args.op = op; + op_args.datatype = datatype; + + /* append to the round-schedule */ + ret = nbc_schedule_round_append (schedule, &op_args, sizeof (op_args), barrier); + if (OMPI_SUCCESS != ret) { + return ret; + } + + NBC_DEBUG(10, "added op2 - ends at byte %i\n", nbc_schedule_get_size (schedule)); + + return OMPI_SUCCESS; +} + /* this function puts an operation into the schedule */ int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier) { @@ -478,6 +505,24 @@ static inline int NBC_Start_round(NBC_Handle *handle) { } ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype); break; + case OP2: + NBC_DEBUG(5, " OP2 (offset %li) ", offset); + NBC_GET_BYTES(ptr,opargs); + NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, + opargs.count, opargs.datatype); + /* get buffers */ + if(opargs.tmpbuf1) { + buf1=(char*)handle->tmpbuf+(long)opargs.buf1; + } else { + buf1=(void *)opargs.buf1; + } + if(opargs.tmpbuf2) { + buf2=(char*)handle->tmpbuf+(long)opargs.buf2; + } else { + buf2=opargs.buf2; + } + ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype); + break; case COPY: NBC_DEBUG(5, " COPY (offset %li) ", offset); NBC_GET_BYTES(ptr,copyargs); diff --git 
a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index 2e1b0dd00b6..b7814494134 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -7,7 +7,7 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler @@ -299,10 +299,24 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, maxr, vpeer, peer, res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); + /* ensure the result ends up in recvbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = 0; + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } else { + lbuf = 0; + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { @@ -311,7 +325,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat VRANK2RANK(peer, vpeer, root) if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } @@ -319,16 +333,18 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat /* this cannot be done until handle->tmpbuf is unused :-( so barrier 
after the op */ if (firstred && MPI_IN_PLACE != sendbuf) { /* perform the reduce with the senbuf */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op2 (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); } - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ @@ -338,8 +354,8 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* and the recvbuf in all remeining rounds */ - res = NBC_Sched_send (recvbuf, false, count, datatype, peer, schedule, false); + /* and the recvbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -373,6 +389,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat } } + if (0 == vrank) assert(lbuf == recvbuf); /* now send to the right hosts */ for (int r = 0; r < maxr; ++r) { if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) { diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index 7b7e3210f9b..3dc49dede09 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -10,7 +10,7 @@ * * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. 
- * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. @@ -80,6 +80,7 @@ typedef enum { SEND, RECV, OP, + OP2, COPY, UNPACK } NBC_Fn_type; @@ -147,6 +148,8 @@ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype dataty int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier); int NBC_Sched_op (void* buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier); +int NBC_Sched_op2 (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, bool barrier); int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier); int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, @@ -317,6 +320,7 @@ static inline void nbc_get_round_size (char *p, unsigned long *size) { offset += sizeof(NBC_Args_recv); break; case OP: + case OP2: /*printf("found a OP at offset %li\n", (long)p-(long)schedule); */ offset += sizeof(NBC_Args_op); \ break; @@ -393,6 +397,7 @@ static inline void nbc_schedule_inc_round (NBC_Schedule *schedule) { printf("*buf: %lu, count: %i, type: %lu, source: %i)\n", (unsigned long)recvargs.buf, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source); \ break; \ case OP: \ + case OP2: \ printf("[%i] OP (offset %li) ", myrank, (long)p-(long)schedule); \ NBC_GET_BYTES(p,opargs); \ printf("*buf1: %lu, buf2: %lu, count: %i, type: %lu)\n", (unsigned long)opargs.buf1, (unsigned long)opargs.buf2, opargs.count, (unsigned 
long)opargs.datatype); \ From 13c4a9122c354719cfb30447e17fe6dfd11af5f6 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 11:10:21 +0900 Subject: [PATCH 02/14] coll/libnbc: always use the allreduce allred_sched_diss algo with non commutative op allred_sched_ring algo simply does not work with non commutative op. Thanks Yuki Matsumoto for the report --- ompi/mca/coll/libnbc/nbc_iallreduce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index b7814494134..ee99b1a2853 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -16,6 +16,7 @@ #include "nbc_internal.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" #include @@ -101,7 +102,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M } /* algorithm selection */ - if(p < 4 || size*count < 65536 || inplace) { + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { alg = NBC_ARED_BINOMIAL; } else { alg = NBC_ARED_RING; From c1cc750ffa85c70699c5ef55ec3a8dd910fa4447 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 14:08:49 +0900 Subject: [PATCH 03/14] coll/libnbc: fix reduce red_sched_binomial algo red_sched_binomial was broken : - use ompi_op_reduce() for Fortran/C++ support - use ompi_op_reduce() keeping in mind op might not be commutative - fix buffer overwrite caused by incorrect usage of ompi_3buff_op_user() - if op is not commutative, reduce on rank 0 and then send the result to root --- ompi/mca/coll/libnbc/nbc_ireduce.c | 94 ++++++++++++++++++------------ 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index 0045deb6a54..69b88d3682e 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -7,16 
+7,19 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler * */ + +#include "ompi/op/op.h" + #include "nbc_internal.h" -static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); @@ -95,6 +98,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ if(rank == root) { /* root reduces in receivebuffer */ handle->tmpbuf = malloc (ext * count); + redbuf = recvbuf; } else { /* recvbuf may not be valid on non-root nodes */ handle->tmpbuf = malloc (ext * count * 2); @@ -135,7 +139,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ switch(alg) { case NBC_RED_BINOMIAL: - res = red_sched_binomial(rank, p, root, sendbuf, recvbuf, count, datatype, op, redbuf, schedule, handle); + res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, schedule, handle); break; case NBC_RED_CHAIN: res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); @@ -257,6 +261,8 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count /* binomial reduce + * if op is not commutative, reduce on rank 0, and then 
send the result to root rank + * * working principle: * - each node gets a virtual rank vrank * - the 'root' node get vrank 0 @@ -285,65 +291,73 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count if (vrank == 0) rank = root; \ if (vrank == root) rank = 0; \ } -static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { - int vrank, vpeer, peer, res, maxr; - - RANK2VRANK(rank, vrank, root); +static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { + int vroot, vrank, vpeer, peer, res, maxr; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + + if (ompi_op_is_commute(op)) { + vroot = root; + } else { + vroot = 0; + } + RANK2VRANK(rank, vrank, vroot); maxr = (int)ceil((log((double)p)/LOG2)); + /* ensure the result ends up in redbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = 0; + tmprbuf = true; + lbuf = redbuf; + tmplbuf = false; + } else { + lbuf = 0; + tmplbuf = true; + rbuf = redbuf; + tmprbuf = false; + } + for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1 << (r - 1)); - VRANK2RANK(peer, vpeer, root) + VRANK2RANK(peer, vpeer, vroot) if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* perform the reduce in my local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - if (firstred) { - if (rank == root) { - /* root is the only one who reduces in the receivebuffer - * take data from sendbuf in first round - 
save copy */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); - } else { - /* all others may not have a receive buffer - * take data from sendbuf in first round - save copy */ - res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, - datatype, op, schedule, true); - } + if (firstred && MPI_IN_PLACE != sendbuf) { + /* perform the reduce with the senbuf */ + res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; } else { - if(rank == root) { - /* root is the only one who reduces in the receivebuffer */ - res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); - } else { - /* all others may not have a receive buffer */ - res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, (char *) redbuf - (intptr_t) handle->tmpbuf, - true, 0, true, count, datatype, op, schedule, true); - } + /* perform the reduce in my local buffer */ + res = NBC_Sched_op2 (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ vpeer = vrank - (1 << (r - 1)); - VRANK2RANK(peer, vpeer, root) - if (firstred) { - /* we did not reduce anything */ + VRANK2RANK(peer, vpeer, vroot) + if (firstred && MPI_IN_PLACE != sendbuf) { + /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* we have to use the redbuf the root (which works in receivebuf) is never sending .. 
*/ - res = NBC_Sched_send ((char *) redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, - false); + /* and the redbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -354,6 +368,14 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen break; } } + /* send to root if vroot ! root */ + if (vroot != root) { + if (0 == rank) { + res = NBC_Sched_send (redbuf, false, count, datatype, root, schedule, false); + } else if (root == rank) { + res = NBC_Sched_recv (redbuf, false, count, datatype, vroot, schedule, false); + } + } return OMPI_SUCCESS; } From 395c3e015fb4d8eb09fc22831c13d608753e8e96 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 14:11:34 +0900 Subject: [PATCH 04/14] coll/libnbc: always use the reduce red_sched_binomial algo with non commutative op red_sched_chain algo simply does not work with non commutative op. 
--- ompi/mca/coll/libnbc/nbc_ireduce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index 69b88d3682e..cf479b241bc 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -93,7 +93,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ } /* algorithm selection */ - if (p > 4 || size * count < 65536) { + if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { alg = NBC_RED_BINOMIAL; if(rank == root) { /* root reduces in receivebuffer */ From 9662e6fee038b2199e64ebca67fb8e96d4d422fa Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 15:43:15 +0900 Subject: [PATCH 05/14] coll/libnbc: fix reduce_scatter algo reduce_scatter was broken : - use ompi_op_reduce() for Fortran/C++ support - use ompi_op_reduce() keeping in mind op might not be commutative - fix buffer overwrite caused by incorrect usage of ompi_3buff_op_user() --- ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 28 ++++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index cd1dad14e76..902061dd55f 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -7,7 +7,7 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 The University of Tennessee and The University * of Tennessee Research Foundation. 
All rights @@ -40,10 +40,11 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i struct mca_coll_base_module_2_1_0_t *module) { int peer, rank, maxr, p, res, count; MPI_Aint ext; - char *redbuf, *sbuf, inplace; + char *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + char *rbuf, *lbuf, *buf; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -87,7 +88,8 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i return OMPI_ERR_OUT_OF_RESOURCE; } - redbuf = (char *) handle->tmpbuf + ext * count; + rbuf = 0; + lbuf = (char *)(ext*count); schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { @@ -104,7 +106,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i peer = rank + (1 << (r - 1)); if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv(0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv(rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -113,19 +115,19 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ if (firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, - op, schedule, true); + res = NBC_Sched_op2 (sendbuf, false, rbuf, true, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, redbuf - (intptr_t) handle->tmpbuf, true, - 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op2 (lbuf, true, rbuf, true, count, datatype, op, schedule, true); } if 
(OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ @@ -134,8 +136,8 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i /* we have to send the senbuf */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* we send an already reduced value from redbuf */ - res = NBC_Sched_send (redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, false); + /* we send an already reduced value from lbuf */ + res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -157,9 +159,9 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i if (rank == 0) { for (long int r = 1, offset = 0 ; r < p ; ++r) { offset += recvcounts[r-1]; - sbuf = redbuf + offset * ext; + sbuf = lbuf + (offset*ext); /* root sends the right buffer to the right receiver */ - res = NBC_Sched_send (sbuf - (intptr_t) handle->tmpbuf, true, recvcounts[r], datatype, r, schedule, + res = NBC_Sched_send (sbuf, true, recvcounts[r], datatype, r, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -167,7 +169,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i } } - res = NBC_Sched_copy (redbuf - (intptr_t) handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, + res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule, false); } else { res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); From 227d15af8c9cb88439e331d744db65522bfc9446 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 7 Jun 2016 15:44:18 +0900 Subject: [PATCH 06/14] coll/libnbc: fix reduce_scatter_block algo reduce_scatter_block 
was broken : - use ompi_op_reduce() for Fortran/C++ support - use ompi_op_reduce() keeping in mind op might not be commutative - fix buffer overwrite caused by incorrect usage of ompi_3buff_op_user() --- .../coll/libnbc/nbc_ireduce_scatter_block.c | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index 4d553c6f556..3244dc77209 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -73,6 +73,8 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i count = p * recvcount; if (0 < count) { + char *rbuf, *lbuf, *buf; + handle->tmpbuf = malloc (ext*count*2); if (NULL == handle->tmpbuf) { OMPI_COLL_LIBNBC_REQUEST_RETURN(handle); @@ -80,6 +82,8 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i return OMPI_ERR_OUT_OF_RESOURCE; } + rbuf = 0; + lbuf = (char *)(ext*count); redbuf = (char *) handle->tmpbuf + ext * count; /* copy data to redbuf if we only have a single node */ @@ -98,7 +102,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i peer = rank + (1 << (r - 1)); if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -106,29 +110,29 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i if (firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, - datatype, op, schedule, true); + res = NBC_Sched_op2 (sendbuf, false, rbuf, true, count, datatype, op, schedule, true); firstred = 0; } else { 
/* perform the reduce in my local buffer */ - res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, - true, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op2 (lbuf, true, rbuf, true, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ peer = rank - (1 << (r - 1)); if(firstred) { /* we have to send the senbuf */ - res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, true); + res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { /* we send an already reduced value from redbuf */ - res = NBC_Sched_send (redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule, true); + res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -157,16 +161,16 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i } else { for (int r = 1, offset = 0 ; r < p ; ++r) { offset += recvcount; - sbuf = ((char *)redbuf) + (offset*ext); + sbuf = lbuf + (offset*ext); /* root sends the right buffer to the right receiver */ - res = NBC_Sched_send (sbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, r, schedule, false); + res = NBC_Sched_send (sbuf, true, recvcount, datatype, r, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } } - res = NBC_Sched_copy (redbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, recvbuf, false, recvcount, + res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount, datatype, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); From cd8d6969fc7bedb145c40c73c0a355bc178f92f1 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: 
Wed, 8 Jun 2016 09:54:46 +0900 Subject: [PATCH 07/14] coll/libnbc: checkpoint fix MPI_IN_PLACE and temporary buffer allocation/usage Thanks Yuki Matsumoto for the report --- ompi/mca/coll/libnbc/nbc_iallreduce.c | 26 ++++++++++++++------- ompi/mca/coll/libnbc/nbc_ireduce.c | 33 ++++++++++++++++++--------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index ee99b1a2853..e7aa5e17257 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -21,7 +21,7 @@ #include static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, - void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); + void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle); @@ -63,6 +63,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M char inplace; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -86,7 +87,8 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -129,7 +131,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M switch(alg) { case NBC_ARED_BINOMIAL: - res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); + res = allred_sched_diss(rank, p, count, datatype, sendbuf, 
recvbuf, op, inplace, schedule, handle); break; case NBC_ARED_RING: res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); @@ -298,25 +300,33 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co if (vrank == root) rank = 0; \ } static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, - MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, maxr, vpeer, peer, res; char *rbuf, *lbuf, *buf; int tmprbuf, tmplbuf; + ptrdiff_t gap; + (void)opal_datatype_span(&datatype->super, count, &gap); root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); /* ensure the result ends up in recvbuf on vrank 0 */ if (0 == (maxr%2)) { - rbuf = 0; + rbuf = (void *)(-gap); tmprbuf = true; lbuf = recvbuf; tmplbuf = false; } else { - lbuf = 0; + lbuf = (void *)(-gap); tmplbuf = true; rbuf = recvbuf; tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf) - gap, count, datatype, MPI_COMM_SELF); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } } for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { @@ -332,7 +342,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat } /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* perform the reduce with the senbuf */ res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; @@ -351,7 +361,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat /* we have to send this round */ vpeer = vrank - (1 << (r - 1)); 
VRANK2RANK(peer, vpeer, root) - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index cf479b241bc..955ed1dbf7f 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -19,7 +19,7 @@ #include "nbc_internal.h" static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, - MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); @@ -58,6 +58,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -92,20 +93,22 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ return res; } + span = opal_datatype_span(&datatype->super, count, &gap); + /* algorithm selection */ if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { alg = NBC_RED_BINOMIAL; if(rank == root) { /* root reduces in receivebuffer */ - handle->tmpbuf = malloc (ext * count); + handle->tmpbuf = malloc (span); redbuf = recvbuf; } else { /* recvbuf may not be valid on non-root nodes */ - handle->tmpbuf = malloc (ext * count * 2); - redbuf = (char*) handle->tmpbuf + ext * count; + handle->tmpbuf = malloc (2*span); + redbuf = (char*) handle->tmpbuf + span - gap; } } else { - handle->tmpbuf = 
malloc (ext * count); + handle->tmpbuf = malloc (span); alg = NBC_RED_CHAIN; segsize = 16384/2; } @@ -139,7 +142,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ switch(alg) { case NBC_RED_BINOMIAL: - res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, schedule, handle); + res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, inplace, schedule, handle); break; case NBC_RED_CHAIN: res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); @@ -292,10 +295,12 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count if (vrank == root) rank = 0; \ } static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, - MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { int vroot, vrank, vpeer, peer, res, maxr; char *rbuf, *lbuf, *buf; int tmprbuf, tmplbuf; + ptrdiff_t gap; + (void)opal_datatype_span(&datatype->super, count, &gap); if (ompi_op_is_commute(op)) { vroot = root; @@ -307,15 +312,21 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen /* ensure the result ends up in redbuf on vrank 0 */ if (0 == (maxr%2)) { - rbuf = 0; + rbuf = (void *)(-gap); tmprbuf = true; lbuf = redbuf; tmplbuf = false; } else { - lbuf = 0; + lbuf = (void *)(-gap); tmplbuf = true; rbuf = redbuf; tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf)-gap, count, datatype, MPI_COMM_SELF); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } } for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { @@ -332,7 +343,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen /* perform the reduce in my local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so 
barrier after the op */ - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* perform the reduce with the senbuf */ res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; @@ -352,7 +363,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen /* we have to send this round */ vpeer = vrank - (1 << (r - 1)); VRANK2RANK(peer, vpeer, vroot) - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { From 03bc2f3136e6863c4a8da085285f2b3e8278cc90 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 8 Jun 2016 16:48:00 +0900 Subject: [PATCH 08/14] coll/base: fix [all]reduce with non zero lower bound datatypes Offset temporary buffer when a non zero lower bound datatype is used. Thanks Hristo Iliev for the report --- ompi/mca/coll/base/coll_base_allreduce.c | 11 ++++++----- ompi/mca/coll/base/coll_base_reduce.c | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index c21676af1e4..aa5a62796b4 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -13,7 +13,7 @@ * Copyright (c) 2009 University of Houston. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -134,7 +134,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; - char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; + char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf_free = NULL, *inplacebuf; ompi_request_t *reqs[2] = {NULL, NULL}; OPAL_PTRDIFF_TYPE span, gap; @@ -155,8 +155,9 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf = (char*) malloc(span); - if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } + inplacebuf_free = (char*) malloc(span); + if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } + inplacebuf = inplacebuf_free - gap; if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); @@ -263,7 +264,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - if (NULL != inplacebuf) free(inplacebuf); + if (NULL != inplacebuf_free) free(inplacebuf_free); return MPI_SUCCESS; error_hndl: diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index f5b2449727e..23d68387720 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -485,6 +485,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL; void *use_this_recvbuf = NULL; + char *tmpbuf_free = NULL; size_t typelng; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; @@ -515,24 +516,26 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t dsize, gap; - char *tmpbuf = NULL; + char *tmpbuf; dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { - tmpbuf = (char *) malloc(dsize); - if (NULL == tmpbuf) { + tmpbuf_free = (char *) malloc(dsize); + if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } + tmpbuf = tmpbuf_free - gap; ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { - tmpbuf = (char *) malloc(dsize); - if (NULL == tmpbuf) { + tmpbuf_free = (char *) malloc(dsize); + if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } + tmpbuf = tmpbuf_free - gap; use_this_recvbuf = tmpbuf; } } @@ -552,9 +555,6 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { return ret; } - if (MPI_IN_PLACE == sendbuf) { - free(use_this_sendbuf); - } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ @@ -562,9 +562,11 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } - free(use_this_recvbuf); } } + if (NULL != tmpbuf_free) { + free(tmpbuf_free); + } return MPI_SUCCESS; } From ef60d82b91acdca0ba65f66f3bcec38e95067951 Mon Sep 17 00:00:00 2001 From: Gilles 
Gouaillardet Date: Fri, 10 Jun 2016 17:06:32 +0900 Subject: [PATCH 09/14] coll/libnbc: fix iallreduce for inter communicators Thanks Yuki Matsumoto for the report --- ompi/mca/coll/libnbc/nbc_iallreduce.c | 43 ++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index e7aa5e17257..5ff36d43ef7 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -202,6 +202,7 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; rank = ompi_comm_rank (comm); rsize = ompi_comm_remote_size (comm); @@ -223,7 +224,8 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -620,45 +622,70 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { int res; + ptrdiff_t span, gap; if (0 == count) { return OMPI_SUCCESS; } + span = opal_datatype_span(&datatype->super, count, &gap); + /* send my data to the remote root */ res = NBC_Sched_send (sendbuf, false, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, false); + /* recv my data to the remote root */ + if (0 != rank || 1 ==(rsize%2)) { + res = NBC_Sched_recv (recvbuf, false, count, datatype, 
0, schedule, false); + } else { + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } if (0 == rank) { - /* wait for data from the remote root */ + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + res = NBC_Sched_barrier (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } + /* get data from remote peers and reduce */ for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_recv (0, true, count, datatype, rpeer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, rpeer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, - schedule, true); + res = NBC_Sched_op2 (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } /* exchange our result with the remote root (each root will broadcast to the other's peers) */ - res = NBC_Sched_recv (0, true, count, datatype, 0, schedule, false); + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } @@ -671,7 +698,7 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, /* broadcast the result to all remote peers */ for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_send (0, true, count, datatype, rpeer, schedule, false); + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rpeer, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return 
res; } From 8583b583bc921139b389d57068d5d61d43ca673e Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 13 Jun 2016 17:29:07 +0900 Subject: [PATCH 10/14] coll/base: checkpoint --- ompi/mca/coll/base/coll_base_reduce.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index 23d68387720..d3c4ea93395 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -602,7 +602,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count ptrdiff_t extent, dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; - char *inplace_temp = NULL; + char *inplace_temp_free = NULL; char *inbuf; /* Initialize */ @@ -624,18 +624,18 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; - inplace_temp = (char*)malloc(dsize); - if (NULL == inplace_temp) { + inplace_temp_free = (char*)malloc(dsize); + if (NULL == inplace_temp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = inplace_temp - gap; + rbuf = inplace_temp_free - gap; } if (size > 1) { free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { - if (NULL != inplace_temp) { - free(inplace_temp); + if (NULL != inplace_temp_free) { + free(inplace_temp_free); } return OMPI_ERR_OUT_OF_RESOURCE; } @@ -682,9 +682,9 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count ompi_op_reduce(op, inbuf, rbuf, count, dtype); } - if (NULL != inplace_temp) { - err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); - free(inplace_temp); + if (NULL != inplace_temp_free) { + err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, rbuf); + free(inplace_temp_free); } if (NULL != free_buffer) { free(free_buffer); From 4376cf2c70b540921309bda19cd4814470b2edab Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 
13 Jun 2016 17:29:22 +0900 Subject: [PATCH 11/14] coll/basic: checkpoint --- ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index fca39e5d51a..bb157cd25e3 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -58,7 +58,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou mca_coll_base_module_t *module) { int rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t extent, buf_size, gap; + ptrdiff_t gap, span; char *recv_buf = NULL, *recv_buf_free = NULL; /* Initialize */ @@ -72,8 +72,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou } /* get datatype information */ - ompi_datatype_type_extent(dtype, &extent); - buf_size = opal_datatype_span(&dtype->super, count, &gap); + span = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -83,12 +82,12 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou if (0 == rank) { /* temporary receive buffer. 
See coll_basic_reduce.c for details on sizing */ - recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - gap; + recv_buf_free = (char*) malloc(span); if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } + recv_buf = recv_buf_free - gap; } /* reduction */ From 44e84a667ced77dd58c8c4259777860f66b3a22e Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 13 Jun 2016 17:29:43 +0900 Subject: [PATCH 12/14] coll/inter: checkpoint --- ompi/mca/coll/inter/coll_inter_allreduce.c | 13 +++++-------- ompi/mca/coll/inter/coll_inter_reduce.c | 16 ++++++++-------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/ompi/mca/coll/inter/coll_inter_allreduce.c b/ompi/mca/coll/inter/coll_inter_allreduce.c index 152fd2467c1..06868c70cd2 100644 --- a/ompi/mca/coll/inter/coll_inter_allreduce.c +++ b/ompi/mca/coll/inter/coll_inter_allreduce.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -48,23 +48,20 @@ mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int err, rank, root = 0; - ptrdiff_t lb, extent; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t *req[2]; + ptrdiff_t gap, span; rank = ompi_comm_rank(comm); /* Perform the reduction locally */ - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } + span = opal_datatype_span(&dtype->super, count, &gap); - tmpbuf = (char *) malloc(count * extent); + tmpbuf = (char *) malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = tmpbuf - lb; + pml_buffer = tmpbuf - gap; err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, root, diff --git a/ompi/mca/coll/inter/coll_inter_reduce.c b/ompi/mca/coll/inter/coll_inter_reduce.c index 14085ec2432..d93ede480db 100644 --- a/ompi/mca/coll/inter/coll_inter_reduce.c +++ b/ompi/mca/coll/inter/coll_inter_reduce.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -47,9 +47,6 @@ mca_coll_inter_reduce_inter(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; - char *free_buffer = NULL; - char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); @@ -58,15 +55,18 @@ mca_coll_inter_reduce_inter(const void *sbuf, void *rbuf, int count, /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { + ptrdiff_t gap, span; + char *free_buffer = NULL; + char *pml_buffer = NULL; + /* Perform the reduce locally with the first process as root */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + span = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(span); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - true_lb; + pml_buffer = free_buffer - gap; err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, 0, comm->c_local_comm, From 527dadf38ac207714a5d27374872159d10c63d40 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 13 Jun 2016 17:29:52 +0900 Subject: [PATCH 13/14] coll/libnbc: checkpoint --- ompi/mca/coll/libnbc/iscan.diff | 78 +++++++++++++++++++ ompi/mca/coll/libnbc/nbc_iexscan.c | 23 +++--- ompi/mca/coll/libnbc/nbc_ireduce.c | 43 ++++++---- ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 8 +- .../coll/libnbc/nbc_ireduce_scatter_block.c | 16 ++-- ompi/mca/coll/libnbc/nbc_iscan.c | 31 ++++---- 6 files changed, 143 insertions(+), 56 deletions(-) create mode 100644 ompi/mca/coll/libnbc/iscan.diff diff --git a/ompi/mca/coll/libnbc/iscan.diff b/ompi/mca/coll/libnbc/iscan.diff new file mode 100644 index 00000000000..6b39ab86e97 --- /dev/null +++ b/ompi/mca/coll/libnbc/iscan.diff @@ -0,0 +1,78 @@ +diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c +index 
a239d14..2179260 100644 +--- a/ompi/mca/coll/libnbc/nbc_iscan.c ++++ b/ompi/mca/coll/libnbc/nbc_iscan.c +@@ -36,16 +36,16 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { + + /* linear iscan + * working principle: +- * 1. each node (but node 0) receives from left neigbor ++ * 1. each node (but node 0) receives from left neighbor + * 2. performs op +- * 3. all but rank p-1 do sends to it's right neigbor and exits ++ * 3. all but rank p-1 do sends to it's right neighbor and exits + * + */ + int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, + struct ompi_communicator_t *comm, ompi_request_t ** request, + struct mca_coll_base_module_2_1_0_t *module) { + int rank, p, res; +- MPI_Aint ext; ++ ptrdiff_t gap, span; + NBC_Schedule *schedule; + char inplace; + NBC_Handle *handle; +@@ -56,13 +56,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + +- res = ompi_datatype_type_extent (datatype, &ext); +- if (MPI_SUCCESS != res) { +- NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); +- return res; +- } +- +- if ((rank == 0) && !inplace) { ++ if (!inplace) { + /* copy data to receivebuf */ + res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { +@@ -75,12 +69,6 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da + return res; + } + +- handle->tmpbuf = malloc (ext * count); +- if (NULL == handle->tmpbuf) { +- NBC_Return_handle (handle); +- return OMPI_ERR_OUT_OF_RESOURCE; +- } +- + #ifdef NBC_CACHE_SCHEDULE + NBC_Scan_args *args, *found, search; + +@@ -103,8 +91,15 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da + handle->schedule = schedule; + + if(rank != 0) { ++ span = opal_datatype_span(&datatype->super, count, &gap); ++ handle->tmpbuf = malloc 
(span); ++ if (NULL == handle->tmpbuf) { ++ NBC_Return_handle (handle); ++ return OMPI_ERR_OUT_OF_RESOURCE; ++ } ++ + /* we have to wait until we have the data */ +- res = NBC_Sched_recv (0, true, count, datatype, rank-1, schedule, true); ++ res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); + return res; +@@ -112,7 +107,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da + + /* perform the reduce in my local buffer */ + /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ +- res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, ++ res = NBC_Sched_op2 ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, + true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 736f16c50be..c922c610944 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -45,7 +45,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -59,22 +59,17 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); - res = ompi_datatype_type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + span = opal_datatype_span(&datatype->super, count, &gap); if (inplace && rank < p - 
1) { /* need more buffer space for the inplace case */ - handle->tmpbuf = malloc(ext * count * 2); + handle->tmpbuf = malloc(2 * span); } else { - handle->tmpbuf = malloc(ext * count); + handle->tmpbuf = malloc(span); } if (handle->tmpbuf == NULL) { @@ -105,7 +100,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ if (inplace && rank < p - 1) { /* if sendbuf == recvbuf do not clobber the send buffer until it has been combined * with the incoming data. */ - res = NBC_Sched_recv ((void *) (ext * count), true, count, datatype, rank-1, schedule, false); + res = NBC_Sched_recv ((void *) (span - gap), true, count, datatype, rank-1, schedule, false); } else { res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false); } @@ -126,10 +121,10 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ /* perform the reduce in my temporary buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after */ if (inplace) { - res = NBC_Sched_op (0, true, sendbuf, false, (void *)(ext * count), true, count, + res = NBC_Sched_op ((void *)(-gap), true, sendbuf, false, (void *)(span - gap), true, count, datatype, op, schedule, true); } else { - res = NBC_Sched_op (0, true, sendbuf, false, recvbuf, false, count, datatype, op, + res = NBC_Sched_op ((void *)(-gap), true, sendbuf, false, recvbuf, false, count, datatype, op, schedule, true); } @@ -139,7 +134,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ } /* send reduced data onward */ - res = NBC_Sched_send (0, true, count, datatype, rank + 1, schedule, false); + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -147,7 +142,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ if (inplace) { /* copy the received data into the receive buffer */ - 
res = NBC_Sched_copy ((void *)(ext * count), true, count, datatype, recvbuf, + res = NBC_Sched_copy ((void *)(span - gap), true, count, datatype, recvbuf, false, count, datatype, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index 955ed1dbf7f..cdddd1705ef 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -23,7 +23,7 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); -static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, +static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); #ifdef NBC_CACHE_SCHEDULE @@ -208,25 +208,20 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count struct mca_coll_base_module_2_1_0_t *module) { int rank, res, rsize; NBC_Schedule *schedule; - MPI_Aint ext; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; rank = ompi_comm_rank (comm); rsize = ompi_comm_remote_size (comm); - res = ompi_datatype_type_extent (datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if 
(OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -238,7 +233,7 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count return OMPI_ERR_OUT_OF_RESOURCE; } - res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, count, datatype, op, schedule, handle); + res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, (void *)(-gap), count, datatype, op, schedule, handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -461,33 +456,51 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu } /* simple linear algorithm for intercommunicators */ -static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, +static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { int res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; if (0 == count) { return OMPI_SUCCESS; } if (MPI_ROOT == root) { - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, true); + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = tmpbuf; + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = tmpbuf; + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } + + res = NBC_Sched_recv (lbuf, tmplbuf, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, schedule, true); + res = 
NBC_Sched_op2 (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else if (MPI_PROC_NULL != root) { - res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, false); + res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index 902061dd55f..58a775f5e43 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -40,6 +40,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i struct mca_coll_base_module_2_1_0_t *module) { int peer, rank, maxr, p, res, count; MPI_Aint ext; + ptrdiff_t gap, span; char *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; @@ -82,14 +83,15 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i maxr = (int) ceil ((log((double) p) / LOG2)); - handle->tmpbuf = malloc (ext * count * 2); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span * 2); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = 0; - lbuf = (char *)(ext*count); + rbuf = (char *)(-gap); + lbuf = (char *)(span - gap); schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index 3244dc77209..17578d3394e 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -38,6 +38,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i struct mca_coll_base_module_2_1_0_t *module) { 
int peer, rank, maxr, p, res, count; MPI_Aint ext; + ptrdiff_t gap, span; char *redbuf, *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; @@ -75,16 +76,17 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i if (0 < count) { char *rbuf, *lbuf, *buf; - handle->tmpbuf = malloc (ext*count*2); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (2*span); if (NULL == handle->tmpbuf) { OMPI_COLL_LIBNBC_REQUEST_RETURN(handle); OBJ_RELEASE(schedule); return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = 0; - lbuf = (char *)(ext*count); - redbuf = (char *) handle->tmpbuf + ext * count; + rbuf = (void *)(-gap); + lbuf = (char *)(span - gap); + redbuf = (char *) handle->tmpbuf + span - gap; /* copy data to redbuf if we only have a single node */ if ((p == 1) && !inplace) { @@ -170,8 +172,10 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i } } - res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount, - datatype, schedule, false); + if ((p != 1) || !inplace) { + res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount, + datatype, schedule, false); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index a239d14ed10..2179260c048 100644 --- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -36,16 +36,16 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { /* linear iscan * working principle: - * 1. each node (but node 0) receives from left neigbor + * 1. each node (but node 0) receives from left neighbor * 2. performs op - * 3. all but rank p-1 do sends to it's right neigbor and exits + * 3. 
all but rank p-1 do sends to its right neighbor and exits * */ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; char inplace; NBC_Handle *handle; @@ -56,13 +56,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); - res = ompi_datatype_type_extent (datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - - if ((rank == 0) && !inplace) { + if (!inplace) { /* copy data to receivebuf */ res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -75,12 +69,6 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da return res; } - handle->tmpbuf = malloc (ext * count); - if (NULL == handle->tmpbuf) { - NBC_Return_handle (handle); - return OMPI_ERR_OUT_OF_RESOURCE; - } - #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -103,8 +91,15 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da handle->schedule = schedule; if(rank != 0) { + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); + if (NULL == handle->tmpbuf) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, rank-1, schedule, true); + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -112,7 +107,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da /* perform the reduce in my
local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, + res = NBC_Sched_op2 ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); From 0bc3bf9144bef41106a528f51ad09aee58decd51 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 15 Jun 2016 11:49:41 +0900 Subject: [PATCH 14/14] checkpoint --- .../coll/basic/coll_basic_reduce_scatter.c | 29 ++++++++++-------- .../basic/coll_basic_reduce_scatter_block.c | 27 +++++++++-------- ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 24 ++++++++++----- .../coll/libnbc/nbc_ireduce_scatter_block.c | 30 ++++++++++++------- 4 files changed, 66 insertions(+), 44 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c index d8e9cc8a0db..8fa4c129cef 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c @@ -14,7 +14,7 @@ * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -367,8 +367,9 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco { int err, i, rank, root = 0, rsize, lsize; int totalcounts; - ptrdiff_t lb, extent; + ptrdiff_t gap, span; char *tmpbuf = NULL, *tmpbuf2 = NULL; + char *lbuf, *buf; ompi_request_t *req; int *disps = NULL; @@ -399,10 +400,7 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco * its size is the same as the local communicator size. 
*/ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } + span = opal_datatype_span(&dtype->super, totalcounts, &gap); /* Generate displacements for the scatterv part */ disps = (int*) malloc(sizeof(int) * lsize); @@ -414,12 +412,14 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco disps[i + 1] = disps[i] + rcounts[i]; } - tmpbuf = (char *) malloc(totalcounts * extent); - tmpbuf2 = (char *) malloc(totalcounts * extent); + tmpbuf = (char *) malloc(span); + tmpbuf2 = (char *) malloc(span); if (NULL == tmpbuf || NULL == tmpbuf2) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } + lbuf = tmpbuf - gap; + buf = tmpbuf2 - gap; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0, @@ -429,7 +429,7 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco goto exit; } - err = MCA_PML_CALL(recv(tmpbuf2, totalcounts, dtype, 0, + err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { @@ -444,10 +444,11 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco /* Loop receiving and calling reduction function (C or Fortran) * The result of this reduction operations is then in - * tmpbuf2. + * lbuf. 
*/ for (i = 1; i < rsize; i++) { - err = MCA_PML_CALL(recv(tmpbuf, totalcounts, dtype, i, + char *tbuf; + err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { @@ -455,7 +456,9 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco } /* Perform the reduction */ - ompi_op_reduce(op, tmpbuf, tmpbuf2, totalcounts, dtype); + ompi_op_reduce(op, lbuf, buf, totalcounts, dtype); + /* swap the buffers */ + tbuf = lbuf; lbuf = buf; buf = tbuf; } } else { /* If not root, send data to the root. */ @@ -468,7 +471,7 @@ mca_coll_basic_reduce_scatter_inter(const void *sbuf, void *rbuf, const int *rco } /* Now do a scatterv on the local communicator */ - err = comm->c_local_comm->c_coll.coll_scatterv(tmpbuf2, rcounts, disps, dtype, + err = comm->c_local_comm->c_coll.coll_scatterv(lbuf, rcounts, disps, dtype, rbuf, rcounts[rank], dtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatterv_module); diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index bb157cd25e3..ae4eedab884 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -12,7 +12,7 @@ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -125,8 +125,9 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou { int err, i, rank, root = 0, rsize, lsize; int totalcounts; - ptrdiff_t lb, extent; + ptrdiff_t gap, span; char *tmpbuf = NULL, *tmpbuf2 = NULL; + char *lbuf, *buf; ompi_request_t *req; rank = ompi_comm_rank(comm); @@ -150,16 +151,15 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou * */ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } + span = opal_datatype_span(&dtype->super, totalcounts, &gap); - tmpbuf = (char *) malloc(totalcounts * extent); - tmpbuf2 = (char *) malloc(totalcounts * extent); + tmpbuf = (char *) malloc(span); + tmpbuf2 = (char *) malloc(span); if (NULL == tmpbuf || NULL == tmpbuf2) { return OMPI_ERR_OUT_OF_RESOURCE; } + lbuf = tmpbuf - gap; + buf = tmpbuf2 - gap; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0, @@ -169,7 +169,7 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou goto exit; } - err = MCA_PML_CALL(recv(tmpbuf2, totalcounts, dtype, 0, + err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { @@ -187,7 +187,8 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou * tmpbuf2. 
*/ for (i = 1; i < rsize; i++) { - err = MCA_PML_CALL(recv(tmpbuf, totalcounts, dtype, i, + char *tbuf; + err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { @@ -195,7 +196,9 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou } /* Perform the reduction */ - ompi_op_reduce(op, tmpbuf, tmpbuf2, totalcounts, dtype); + ompi_op_reduce(op, lbuf, buf, totalcounts, dtype); + /* swap the buffers */ + tbuf = lbuf; lbuf = buf; buf = tbuf; } } else { /* If not root, send data to the root. */ @@ -208,7 +211,7 @@ mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcou } /* Now do a scatterv on the local communicator */ - err = comm->c_local_comm->c_coll.coll_scatter(tmpbuf2, rcount, dtype, + err = comm->c_local_comm->c_coll.coll_scatter(lbuf, rcount, dtype, rbuf, rcount, dtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatter_module); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index 58a775f5e43..31daa80bf39 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -205,6 +205,7 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, struct mca_coll_base_module_2_1_0_t *module) { int rank, res, count, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -223,13 +224,15 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, count += recvcounts[r]; } + span = opal_datatype_span(&datatype->super, count, &gap); + res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } if (count > 0) { - handle->tmpbuf = malloc (2 * ext * count); + handle->tmpbuf = malloc (2 * span); if (OPAL_UNLIKELY(NULL == 
handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -253,43 +256,48 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, } if (0 == rank) { - res = NBC_Sched_recv ((void *) 0, true, count, datatype, 0, schedule, true); + char *lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, datatype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, peer, schedule, true); + char *tbuf; + res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, datatype, + res = NBC_Sched_op2 (lbuf, true, rbuf, true, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, 0, schedule, false); + res = NBC_Sched_recv (rbuf, true, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_send ((void *) 0, true, count, datatype, 0, schedule, true); + res = NBC_Sched_send (lbuf, true, count, datatype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } /* scatter */ - for (int peer = 0, offset = ext * count ; peer < rsize ; ++peer) { - res = NBC_Sched_send ((void *)(uintptr_t) offset, true, recvcounts[peer], datatype, peer, schedule, + for (int peer = 0, offset = 0 ; peer < rsize ; ++peer) { + res = 
NBC_Sched_send (rbuf + offset, true, recvcounts[peer], datatype, peer, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index 17578d3394e..6b7b337ab90 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -201,11 +201,12 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i return OMPI_SUCCESS; } -int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype, +int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request, struct mca_coll_base_module_2_1_0_t *module) { int rank, res, count, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -226,8 +227,10 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i count = rcount * rsize; + span = opal_datatype_span(&dtype->super, count, &gap); + if (count > 0) { - handle->tmpbuf = malloc (2 * ext * count); + handle->tmpbuf = malloc (2 * span); if (NULL == handle->tmpbuf) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -244,42 +247,47 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i handle->schedule = schedule; /* send my data to the remote root */ - res = NBC_Sched_send (sbuf, false, count, dtype, 0, schedule, false); + res = NBC_Sched_send (sendbuf, false, count, dtype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } if (0 == rank) { - res = NBC_Sched_recv ((void *) 0, true, count, dtype, 0, schedule, true); + char 
*lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, dtype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, peer, schedule, true); + char *tbuf; + res = NBC_Sched_recv (rbuf, true, count, dtype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, dtype, op, - schedule, true); + res = NBC_Sched_op2 (lbuf, true, rbuf, true, count, dtype, + op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, 0, schedule, false); + res = NBC_Sched_recv (rbuf, true, count, dtype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_send ((void *) 0, true, count, dtype, 0, schedule, true); + res = NBC_Sched_send (lbuf, true, count, dtype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -287,7 +295,7 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i /* scatter */ for (int peer = 0 ; peer < rsize ; ++peer) { - res = NBC_Sched_send ((void *)(ext * (count + peer * rcount)), true, rcount, dtype, peer, schedule, false); + res = NBC_Sched_send (rbuf + ext * rcount * peer, true, rcount, dtype, peer, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -296,7 +304,7 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i } 
/* receive my block */ - res = NBC_Sched_recv(rbuf, true, rcount, dtype, 0, schedule, false); + res = NBC_Sched_recv(recvbuf, false, rcount, dtype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res;