From 682557af36b9f76382d79e01af7d75ce4912e848 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 21 Oct 2015 00:55:28 -0400 Subject: [PATCH 01/11] Cleanup the memory handling for temporary buffers in some of the collective modules. Added a new function opan_datatype_span, to compute the memory span of count number of datatype, excluding the gaps in the beginning and at the end. If a memory allocation is made using the returned value, the gap (also returned) should be removed from the allocated pointer. (back-ported from commit open-mpi/ompi@4d00c59b2ebd7a97dfc7d366d0beed486e5cacd0) --- ompi/mca/coll/base/README.memory_management | 124 ++++++++++++++++++ ompi/mca/coll/basic/coll_basic_allreduce.c | 16 +-- ompi/mca/coll/basic/coll_basic_alltoallw.c | 23 ++-- ompi/mca/coll/basic/coll_basic_exscan.c | 11 +- ompi/mca/coll/basic/coll_basic_reduce.c | 25 ++-- .../coll/basic/coll_basic_reduce_scatter.c | 13 +- .../basic/coll_basic_reduce_scatter_block.c | 9 +- ompi/mca/coll/basic/coll_basic_scan.c | 9 +- ompi/mca/coll/cuda/coll_cuda_allreduce.c | 15 +-- ompi/mca/coll/cuda/coll_cuda_exscan.c | 15 +-- ompi/mca/coll/cuda/coll_cuda_reduce.c | 16 +-- .../cuda/coll_cuda_reduce_scatter_block.c | 18 +-- ompi/mca/coll/cuda/coll_cuda_scan.c | 15 +-- ompi/mca/coll/sm/coll_sm_reduce.c | 29 ++-- ompi/mca/coll/tuned/coll_tuned_allgather.c | 11 +- ompi/mca/coll/tuned/coll_tuned_allreduce.c | 24 ++-- ompi/mca/coll/tuned/coll_tuned_alltoall.c | 31 ++--- ompi/mca/coll/tuned/coll_tuned_alltoallv.c | 7 +- ompi/mca/coll/tuned/coll_tuned_gather.c | 20 +-- ompi/mca/coll/tuned/coll_tuned_reduce.c | 50 ++++--- .../coll/tuned/coll_tuned_reduce_scatter.c | 41 +++--- ompi/mca/coll/tuned/coll_tuned_scatter.c | 24 ++-- opal/datatype/opal_datatype.h | 21 ++- 23 files changed, 340 insertions(+), 227 deletions(-) create mode 100644 ompi/mca/coll/base/README.memory_management diff --git a/ompi/mca/coll/base/README.memory_management b/ompi/mca/coll/base/README.memory_management new file mode 100644 index 0000000000..1e34f577c1 --- /dev/null +++ b/ompi/mca/coll/base/README.memory_management @@ -0,0 +1,124 @@ + /* This comment applies to all collectives (including the basic + * module) where we allocate a temporary buffer. For the next few + * lines of code, it's tremendously complicated how we decided that + * this was the Right Thing to do. Sit back and enjoy. And prepare + * to have your mind warped. :-) + * + * Recall some definitions (I always get these backwards, so I'm + * going to put them here): + * + * extent: the length from the lower bound to the upper bound -- may + * be considerably larger than the buffer required to hold the data + * (or smaller! But it's easiest to think about when it's larger). + * + * true extent: the exact number of bytes required to hold the data + * in the layout pattern in the datatype. + * + * For example, consider the following buffer (just talking about + * true_lb, extent, and true extent -- extrapolate for true_ub: + * + * A B C + * -------------------------------------------------------- + * | | | + * -------------------------------------------------------- + * + * There are multiple cases: + * + * 1. A is what we give to MPI_Send (and friends), and A is where + * the data starts, and C is where the data ends. In this case: + * + * - extent: C-A + * - true extent: C-A + * - true_lb: 0 + * + * A C + * -------------------------------------------------------- + * | | + * -------------------------------------------------------- + * <=======================extent=========================> + * <======================true extent=====================> + * + * 2. A is what we give to MPI_Send (and friends), B is where the + * data starts, and C is where the data ends. In this case: + * + * - extent: C-A + * - true extent: C-B + * - true_lb: positive + * + * A B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <===============true extent=============> + * + * 3. B is what we give to MPI_Send (and friends), A is where the + * data starts, and C is where the data ends. In this case: + * + * - extent: C-A + * - true extent: C-A + * - true_lb: negative + * + * A B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <======================true extent=====================> + * + * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is + * where the data starts, and C is where the data ends. In this + * case: + * + * - extent: C-MPI_BOTTOM + * - true extent: C-B + * - true_lb: [potentially very large] positive + * + * MPI_BOTTOM B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <===============true extent=============> + * + * So in all cases, for a temporary buffer, all we need to malloc() + * is a buffer of size true_extent. We therefore need to know two + * pointer values: what value to give to MPI_Send (and friends) and + * what value to give to free(), because they might not be the same. + * + * Clearly, what we give to free() is exactly what was returned from + * malloc(). That part is easy. :-) + * + * What we give to MPI_Send (and friends) is a bit more complicated. + * Let's take the 4 cases from above: + * + * 1. If A is what we give to MPI_Send and A is where the data + * starts, then clearly we give to MPI_Send what we got back from + * malloc(). + * + * 2. If B is what we get back from malloc, but we give A to + * MPI_Send, then the buffer range [A,B) represents "dead space" + * -- no data will be put there. So it's safe to give B-true_lb to + * MPI_Send. More specifically, the true_lb is positive, so B-true_lb is + * actually A. + * + * 3. If A is what we get back from malloc, and B is what we give to + * MPI_Send, then the true_lb is negative, so A-true_lb will actually equal + * B. + * + * 4. Although this seems like the weirdest case, it's actually + * quite similar to case #2 -- the pointer we give to MPI_Send is + * smaller than the pointer we got back from malloc(). + * + * Hence, in all cases, we give (return_from_malloc - true_lb) to MPI_Send. + * + * This works fine and dandy if we only have (count==1), which we + * rarely do. ;-) So we really need to allocate (true_extent + + * ((count - 1) * extent)) to get enough space for the rest. This may + * be more than is necessary, but it's ok. + * + * Simple, no? :-) + * + */ + + diff --git a/ompi/mca/coll/basic/coll_basic_allreduce.c b/ompi/mca/coll/basic/coll_basic_allreduce.c index 9dc665124e..dbd8027395 100644 --- a/ompi/mca/coll/basic/coll_basic_allreduce.c +++ b/ompi/mca/coll/basic/coll_basic_allreduce.c @@ -78,8 +78,8 @@ mca_coll_basic_allreduce_inter(void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int err, i, rank, root = 0, rsize; - ptrdiff_t lb, extent; + int err, i, rank, root = 0, rsize, line; + ptrdiff_t extent, dsize, gap; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t *req[2]; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; @@ -98,16 +98,14 @@ mca_coll_basic_allreduce_inter(void *sbuf, void *rbuf, int count, * simultaniously. */ /*****************************************************************/ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); + err = ompi_datatype_type_extent(dtype, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } - - tmpbuf = (char *) malloc(count * extent); - if (NULL == tmpbuf) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - pml_buffer = tmpbuf - lb; + dsize = opal_datatype_span(&dtype->super, count, &gap); + tmpbuf = (char *) malloc(dsize); + if (NULL == tmpbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } + pml_buffer = tmpbuf - gap; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0, diff --git a/ompi/mca/coll/basic/coll_basic_alltoallw.c b/ompi/mca/coll/basic/coll_basic_alltoallw.c index 9f85da0927..b9bcf807a7 100644 --- a/ompi/mca/coll/basic/coll_basic_alltoallw.c +++ b/ompi/mca/coll/basic/coll_basic_alltoallw.c @@ -14,7 +14,7 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -42,10 +42,10 @@ mca_coll_basic_alltoallw_intra_inplace(void *rbuf, int *rcounts, const int *rdis mca_coll_base_module_t *module) { mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; - int i, j, size, rank, err=MPI_SUCCESS, max_size; - MPI_Request *preq; - char *tmp_buffer; - ptrdiff_t ext; + int i, j, size, rank, err = MPI_SUCCESS, max_size; + MPI_Request *preq, *reqs = NULL; + char *tmp_buffer, *save_buffer = NULL; + ptrdiff_t ext, gap; /* Initialize. */ @@ -59,17 +59,17 @@ mca_coll_basic_alltoallw_intra_inplace(void *rbuf, int *rcounts, const int *rdis /* Find the largest receive amount */ for (i = 0, max_size = 0 ; i < size ; ++i) { - ompi_datatype_type_extent (rdtypes[i], &ext); - ext *= rcounts[i]; + ext = opal_datatype_span(&rdtypes[i]->super, rcounts[i], &gap); max_size = ext > max_size ? ext : max_size; } /* Allocate a temporary buffer */ - tmp_buffer = calloc (max_size, 1); + tmp_buffer = save_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer -= gap; /* in-place alltoallw slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { @@ -129,7 +129,12 @@ mca_coll_basic_alltoallw_intra_inplace(void *rbuf, int *rcounts, const int *rdis error_hndl: /* Free the temporary buffer */ - free (tmp_buffer); + free (save_buffer); + if( MPI_SUCCESS != err ) { /* Free the requests. */ + if( NULL != reqs ) { + mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2); + } + } /* All done */ diff --git a/ompi/mca/coll/basic/coll_basic_exscan.c b/ompi/mca/coll/basic/coll_basic_exscan.c index 97142a496e..8901258ea7 100644 --- a/ompi/mca/coll/basic/coll_basic_exscan.c +++ b/ompi/mca/coll/basic/coll_basic_exscan.c @@ -47,7 +47,7 @@ mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int size, rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *reduce_buffer = NULL; @@ -81,15 +81,14 @@ mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count, /* Get a temporary buffer to perform the reduction into. Rationale * for malloc'ing this size is provided in coll_basic_reduce.c. */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - reduce_buffer = free_buffer - lb; - err = ompi_datatype_copy_content_same_ddt(dtype, count, + reduce_buffer = free_buffer - gap; + err = ompi_datatype_copy_content_same_ddt(dtype, count, reduce_buffer, (char*)sbuf); /* Receive the reduced value from the prior rank */ diff --git a/ompi/mca/coll/basic/coll_basic_reduce.c b/ompi/mca/coll/basic/coll_basic_reduce.c index 203cd260bd..cf4b716cd2 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce.c +++ b/ompi/mca/coll/basic/coll_basic_reduce.c @@ -325,7 +325,7 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, { int i, size, rank, vrank; int err, peer, dim, mask; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t lb, extent, dsize, gap; char *free_buffer = NULL; char *free_rbuf = NULL; char *pml_buffer = NULL; @@ -353,14 +353,14 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, * rationale above. */ ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - lb; + pml_buffer = free_buffer - gap; /* read the comment about commutative operations (few lines down * the page) */ if (ompi_op_is_commute(op)) { @@ -371,12 +371,12 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, * rationale above. */ if (MPI_IN_PLACE == sbuf) { - inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); + inplace_temp = (char*)malloc(dsize); if (NULL == inplace_temp) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } - sbuf = inplace_temp - lb; + sbuf = inplace_temp - gap; err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf); } snd_buffer = (char*)sbuf; @@ -385,12 +385,12 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, /* root is the only one required to provide a valid rbuf. * Assume rbuf is invalid for all other ranks, so fix it up * here to be valid on all non-leaf ranks */ - free_rbuf = (char*)malloc(true_extent + (count - 1) * extent); + free_rbuf = (char*)malloc(dsize); if (NULL == free_rbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } - rbuf = free_rbuf - lb; + rbuf = free_rbuf - gap; } /* Loop over cube dimensions. High processes send to low ones in the @@ -521,7 +521,7 @@ mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int i, err, size; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; @@ -538,14 +538,13 @@ mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* Root receives and reduces messages */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - lb; + pml_buffer = free_buffer - gap; /* Initialize the receive buffer. */ diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c index b5dd0656b9..d64abda3b2 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c @@ -71,7 +71,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; @@ -96,9 +96,8 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -111,7 +110,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -119,7 +118,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); - result_buf = result_buf_free - lb; + result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); @@ -323,7 +322,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index 69f1088572..9e6854bfb9 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -58,7 +58,7 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, mca_coll_base_module_t *module) { int rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; char *recv_buf = NULL, *recv_buf_free = NULL; /* Initialize */ @@ -72,9 +72,8 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -85,7 +84,7 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/coll/basic/coll_basic_scan.c b/ompi/mca/coll/basic/coll_basic_scan.c index 627e9a3067..c797bcee1e 100644 --- a/ompi/mca/coll/basic/coll_basic_scan.c +++ b/ompi/mca/coll/basic/coll_basic_scan.c @@ -45,7 +45,7 @@ mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int size, rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; @@ -72,14 +72,11 @@ mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - lb; + pml_buffer = free_buffer - gap; /* Copy the send buffer into the receive buffer. */ diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c index 58d322ba35..76d4743cb9 100644 --- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. @@ -34,15 +34,14 @@ mca_coll_cuda_allreduce(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; - char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2; + ptrdiff_t gap; + char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -50,7 +49,7 @@ mca_coll_cuda_allreduce(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -61,7 +60,7 @@ mca_coll_cuda_allreduce(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_allreduce(sbuf, rbuf, count, dtype, op, comm, s->c_coll.coll_allreduce_module); if (NULL != sbuf1) { diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c index b93a822083..bff1a3c193 100644 --- a/ompi/mca/coll/cuda/coll_cuda_exscan.c +++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. @@ -26,15 +26,14 @@ int mca_coll_cuda_exscan(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; - char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2; + ptrdiff_t gap; + char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -42,7 +41,7 @@ int mca_coll_cuda_exscan(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -53,7 +52,7 @@ int mca_coll_cuda_exscan(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_exscan(sbuf, rbuf, count, dtype, op, comm, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c index 2c5b40fdde..50a887b46d 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. @@ -34,15 +34,15 @@ mca_coll_cuda_reduce(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; - char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2; + ptrdiff_t gap; + char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -50,7 +50,7 @@ mca_coll_cuda_reduce(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -61,7 +61,7 @@ mca_coll_cuda_reduce(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count, dtype, op, root, comm, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c index 55659e4f47..fc45e73b07 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. @@ -38,16 +38,16 @@ mca_coll_cuda_reduce_scatter_block(void *sbuf, void *rbuf, int rcount, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; - char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2; + ptrdiff_t gap; + char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t sbufsize, rbufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - sbufsize = (true_extent + (ptrdiff_t)(rcount - 1) * extent) * ompi_comm_size(comm); - rbufsize = true_extent + (ptrdiff_t)(rcount - 1) * extent; + rbufsize = opal_datatype_span(&dtype->super, rcount, &gap); + + sbufsize = rbufsize * ompi_comm_size(comm); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(sbufsize); if (NULL == sbuf1) { @@ -55,7 +55,7 @@ mca_coll_cuda_reduce_scatter_block(void *sbuf, void *rbuf, int rcount, } opal_cuda_memcpy_sync(sbuf1, sbuf, sbufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -66,7 +66,7 @@ mca_coll_cuda_reduce_scatter_block(void *sbuf, void *rbuf, int rcount, } opal_cuda_memcpy_sync(rbuf1, rbuf, rbufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_reduce_scatter_block(sbuf, rbuf, rcount, dtype, op, comm, s->c_coll.coll_reduce_scatter_block_module); diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c index 62f01296e0..8444701e7f 100644 --- a/ompi/mca/coll/cuda/coll_cuda_scan.c +++ b/ompi/mca/coll/cuda/coll_cuda_scan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. @@ -33,15 +33,14 @@ int mca_coll_cuda_scan(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; - char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2; + ptrdiff_t gap; + char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -49,7 +48,7 @@ int mca_coll_cuda_scan(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -60,7 +59,7 @@ int mca_coll_cuda_scan(void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_scan(sbuf, rbuf, count, dtype, op, comm, s->c_coll.coll_scan_module); diff --git a/ompi/mca/coll/sm/coll_sm_reduce.c b/ompi/mca/coll/sm/coll_sm_reduce.c index 77260b8bb0..658e1e3926 100644 --- a/ompi/mca/coll/sm/coll_sm_reduce.c +++ b/ompi/mca/coll/sm/coll_sm_reduce.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -187,9 +187,9 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, size_t total_size, max_data, bytes; mca_coll_sm_in_use_flag_t *flag; mca_coll_sm_data_index_t *index; - size_t ddt_size; + size_t ddt_size, segsize; size_t segment_ddt_count, segment_ddt_bytes, zero = 0; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t extent, gap; /* Setup some identities */ @@ -205,10 +205,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, /* ddt_size is the packed size (e.g., MPI_SHORT_INT is 6) */ ompi_datatype_type_size(dtype, &ddt_size); /* extent is from lb to ub (e.g., MPI_SHORT_INT is 8) */ - ompi_datatype_get_extent(dtype, &lb, &extent); - /* true_extent is extent of actual type map, ignoring lb and ub - (e.g., MPI_SHORT_INT is 8) */ - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + ompi_datatype_type_extent(dtype, &extent); segment_ddt_count = mca_coll_sm_component.sm_fragment_size / ddt_size; iov.iov_len = segment_ddt_bytes = segment_ddt_count * ddt_size; total_size = ddt_size * count; @@ -265,14 +262,15 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, entire user buffer) -- we only need to be able to hold "segment_ddt_count" instances (i.e., the number of instances that can be held in a single fragment) */ - - free_buffer = (char*)malloc(true_extent + - (segment_ddt_count - 1) * extent); + + segsize = opal_datatype_span(&dtype->super, segment_ddt_count, &gap); + + free_buffer = (char*)malloc(segsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - reduce_temp_buffer = free_buffer - lb; - + reduce_temp_buffer = free_buffer - gap; + /* Trickery here: we use a potentially smaller count than the user count -- use the largest count that is <= user's count that will fit within a single segment. */ @@ -312,15 +310,16 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, as the sbuf */ if (MPI_IN_PLACE == sbuf && (size - 1) != rank) { - inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); + segsize = opal_datatype_span(&dtype->super, count, &gap); + inplace_temp = (char*)malloc(segsize); if (NULL == inplace_temp) { if (NULL != free_buffer) { free(free_buffer); } return OMPI_ERR_OUT_OF_RESOURCE; } - sbuf = inplace_temp - lb; - ompi_datatype_copy_content_same_ddt(dtype, count, (char *) sbuf, (char *) rbuf); + sbuf = inplace_temp - gap; + ompi_datatype_copy_content_same_ddt(dtype, count, (char *)sbuf, (char *)rbuf); } else { inplace_temp = NULL; } diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather.c b/ompi/mca/coll/tuned/coll_tuned_allgather.c index 6bc22d84e5..327c897453 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgather.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgather.c @@ -187,19 +187,16 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, - copy blocks from shift buffer starting at block [rank] in rbuf. */ if (0 != rank) { - ptrdiff_t true_extent, true_lb; char *free_buf = NULL, *shift_buf = NULL; + ptrdiff_t span, gap; - err = ompi_datatype_get_true_extent(rdtype, &true_lb, &true_extent); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + span = opal_datatype_span(&rdtype->super, (size - rank) * rcount, &gap); - free_buf = (char*) calloc(((true_extent - true_lb + - ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)), - sizeof(char)); + free_buf = (char*)calloc(span, sizeof(char)); if (NULL == free_buf) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; } - shift_buf = free_buf - rlb; + shift_buf = free_buf - gap; tmpsend = (char*) rbuf; err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount), diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce.c b/ompi/mca/coll/tuned/coll_tuned_allreduce.c index dab78a1637..050617260d 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce.c @@ -152,8 +152,8 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; - ptrdiff_t true_lb, true_extent, lb, extent; ompi_request_t *reqs[2] = {NULL, NULL}; + OPAL_PTRDIFF_TYPE span, gap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -171,12 +171,8 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, } /* Allocate and initialize temporary send buffer */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - - inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); + span = opal_datatype_span(&dtype->super, count, &gap); + inplacebuf = (char*) malloc(span); if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } if (MPI_IN_PLACE == sbuf) { @@ -648,9 +644,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count int segcount, max_segcount, num_phases, phase, block_count, inbi; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent; ptrdiff_t block_offset, max_real_segsize; ompi_request_t *reqs[2] = {NULL, NULL}; + OPAL_PTRDIFF_TYPE lb, extent, gap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -668,10 +664,6 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count } /* Determine segment count based on the suggested segment size */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } ret = ompi_datatype_type_size( dtype, &typelng); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } segcount = count; @@ -704,7 +696,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count early_blockcount, late_blockcount ) COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, max_segcount, k) - max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent; + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ inbuf[0] = (char*)malloc(max_real_segsize); @@ -759,8 +753,8 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count block_count = ((rank < split_rank)? early_blockcount : late_blockcount); COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) - phase_count = ((phase < split_phase)? - (early_phase_segcount) : (late_phase_segcount)); + phase_count = ((phase < split_phase)? + (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall.c b/ompi/mca/coll/tuned/coll_tuned_alltoall.c index 2781deec41..20b83d9916 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ @@ -63,10 +63,10 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; + OPAL_PTRDIFF_TYPE gap; MPI_Request *preq; - char *tmp_buffer; + char *allocated_buffer = NULL, *tmp_buffer; size_t max_size; - ptrdiff_t ext; /* Initialize. */ @@ -79,14 +79,14 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, } /* Find the largest receive amount */ - ompi_datatype_type_extent (rdtype, &ext); - max_size = ext * rcount; + max_size = opal_datatype_span(&rdtype->super, rcount, &gap); /* Allocate a temporary buffer */ - tmp_buffer = calloc (max_size, 1); - if (NULL == tmp_buffer) { + allocated_buffer = calloc (max_size, 1); + if (NULL == allocated_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer = allocated_buffer - gap; /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { @@ -139,7 +139,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount, error_hndl: /* Free the temporary buffer */ - free (tmp_buffer); + free (allocated_buffer); /* All done */ @@ -214,7 +214,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, int i, k, line = -1, rank, size, err = 0, weallocated = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; - ptrdiff_t rlb, slb, tlb, sext, rext, tsext; + OPAL_PTRDIFF_TYPE sext, rext, span, gap; struct ompi_datatype_t *new_ddt; #ifdef blahblah mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; @@ -232,15 +232,12 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoall_intra_bruck rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } - - err = ompi_datatype_get_true_extent(sdtype, &tlb, &tsext); - if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_type_extent (sdtype, &sext); - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + err = ompi_datatype_type_extent (rdtype, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } + span = opal_datatype_span(&sdtype->super, size * scount, &gap); #ifdef blahblah /* try and SAVE memory by using the data segment hung off @@ -263,9 +260,9 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, #endif /* tmp buffer allocation for message data */ - tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext); + tmpbuf_free = (char *) malloc(span); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } - tmpbuf = tmpbuf_free - slb; + tmpbuf = tmpbuf_free - gap; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c index 05e47bd361..5c9f4b1b21 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c @@ -60,7 +60,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con MPI_Request *preq; char *tmp_buffer; size_t max_size, rdtype_size; - ptrdiff_t ext; + OPAL_PTRDIFF_TYPE ext, gap; /* Initialize. */ @@ -76,16 +76,17 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { - size_t size = ext * rcounts[i]; - + size_t size = opal_datatype_span(&rdtype->super, rcounts[i], &gap); max_size = size > max_size ? size : max_size; } + /* The gap will always be the same as we are working on the same datatype */ /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer += gap; /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { diff --git a/ompi/mca/coll/tuned/coll_tuned_gather.c b/ompi/mca/coll/tuned/coll_tuned_gather.c index 90c224df4b..d99fdb1eeb 100644 --- a/ompi/mca/coll/tuned/coll_tuned_gather.c +++ b/ompi/mca/coll/tuned/coll_tuned_gather.c @@ -64,8 +64,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; - MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; + MPI_Aint sextent, sgap, ssize; + MPI_Aint rextent, rgap, rsize; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -79,14 +79,14 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); bmtree = data->cached_in_order_bmtree; - ompi_datatype_get_extent(sdtype, &slb, &sextent); - ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); + ompi_datatype_type_extent(sdtype, &sextent); + ompi_datatype_type_extent(rdtype, &rextent); + ssize = opal_datatype_span(&sdtype->super, scount * size, &sgap); + rsize = opal_datatype_span(&rdtype->super, rcount * size, &rgap); vrank = (rank - root + size) % size; if (rank == root) { - ompi_datatype_get_extent(rdtype, &rlb, &rextent); - ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; @@ -98,12 +98,12 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ - tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); + tempbuf = (char *) malloc(rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - rlb; + ptmp = tempbuf - rgap; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv(sbuf, scount, sdtype, @@ -121,12 +121,12 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount, /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ - tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); + tempbuf = (char *) malloc(ssize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - slb; + ptmp = tempbuf - sgap; /* local copy to tempbuf */ err = ompi_datatype_sndrcv(sbuf, scount, sdtype, ptmp, scount, sdtype); diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce.c b/ompi/mca/coll/tuned/coll_tuned_reduce.c index bca61a1617..2764c1ca7f 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce.c @@ -13,6 +13,8 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -74,7 +76,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; - ptrdiff_t extent, lower_bound, segment_increment; + ptrdiff_t extent, size, gap, segment_increment; size_t typelng; ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; @@ -84,9 +86,8 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c * Determine number of segments and number of elements * sent per operation */ - ompi_datatype_get_extent( datatype, &lower_bound, &extent ); - ompi_datatype_type_size( datatype, &typelng ); - num_segments = (int)(((size_t)original_count + (size_t)count_by_segment - (size_t)1) / (size_t)count_by_segment); + ompi_datatype_type_extent( datatype, &extent ); + num_segments = (original_count + count_by_segment - 1) / count_by_segment; segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; @@ -103,21 +104,19 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { - ptrdiff_t true_lower_bound, true_extent, real_segment_size; - ompi_datatype_get_true_extent( datatype, &true_lower_bound, - &true_extent ); + ptrdiff_t real_segment_size; /* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ - accumbuf_free = (char*)malloc(true_extent + - (ptrdiff_t)(original_count - 1) * extent); + size = opal_datatype_span(&datatype->super, original_count, &gap); + accumbuf_free = (char*)malloc(size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } - accumbuf = accumbuf_free - lower_bound; + accumbuf = accumbuf_free - gap; } /* If this is a non-commutative operation we must copy @@ -128,12 +127,12 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ - real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent; + real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } - inbuf[0] = inbuf_free[0] - lower_bound; + inbuf[0] = inbuf_free[0] - gap; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { @@ -141,7 +140,7 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } - inbuf[1] = inbuf_free[1] - lower_bound; + inbuf[1] = inbuf_free[1] - gap; } /* reset input buffer index and receive count */ @@ -538,14 +537,13 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, use_this_sendbuf = sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { - ptrdiff_t tlb, text, lb, ext; + ptrdiff_t dsize, gap; char *tmpbuf = NULL; - ompi_datatype_get_extent(datatype, &lb, &ext); - ompi_datatype_get_true_extent(datatype, &tlb, &text); + dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { - tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); + tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } @@ -554,7 +552,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { - tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); + tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } @@ -606,8 +604,6 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, * GEF Oct05 after asking Jeff. */ -/* copied function (with appropriate renaming) starts here */ - /* * reduce_lin_intra * @@ -624,7 +620,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int i, rank, err, size; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t extent, dsize, gap; char *free_buffer = NULL, *pml_buffer = NULL; char *inplace_temp = NULL, *inbuf; @@ -648,25 +644,25 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, extent and true extent */ /* for reducing buffer allocation lengths.... */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); + ompi_datatype_type_extent(dtype, &extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; - inplace_temp = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); + inplace_temp = (char*)malloc(dsize); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = inplace_temp - lb; + rbuf = inplace_temp - gap; } if (size > 1) { - free_buffer = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } - pml_buffer = free_buffer - lb; + pml_buffer = free_buffer - gap; } /* Initialize the receive buffer. */ diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c index f1f0555c72..f89b8dc655 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c @@ -91,13 +91,11 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, if (root == rank) { /* We must allocate temporary receive buffer on root to ensure that rbuf is big enough */ - ptrdiff_t lb, extent, tlb, textent; + ptrdiff_t dsize, gap; + dsize = opal_datatype_span(&dtype->super, total_count, &gap); - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &tlb, &textent); - - tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); - tmprbuf = tmprbuf_free - lb; + tmprbuf_free = (char*) malloc(dsize); + tmprbuf = tmprbuf_free - gap; } err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); @@ -149,7 +147,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, { int i, rank, size, count, err = OMPI_SUCCESS; int tmp_size, remain = 0, tmp_rank, *disps = NULL; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; @@ -176,9 +174,8 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (ptrdiff_t)(count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -187,7 +184,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, /* Allocate temporary receive buffer. */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -195,7 +192,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); - result_buf = result_buf_free - lb; + result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); @@ -474,9 +471,8 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, int inbi, *displs = NULL; char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL; char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent, max_real_segsize; + ptrdiff_t extent, max_real_segsize, dsize, gap; ompi_request_t *reqs[2] = {NULL, NULL}; - size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -515,26 +511,23 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, rbuf can be of rcounts[rank] size. - up to two temporary buffers used for communication/computation overlap. */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_type_size( dtype, &typelng); + ret = ompi_datatype_type_extent(dtype, &extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - max_real_segsize = true_extent + (ptrdiff_t)(max_block_count - 1) * extent; + max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap); + dsize = opal_datatype_span(&dtype->super, total_count, &gap); - accumbuf_free = (char*)malloc(true_extent + (ptrdiff_t)(total_count - 1) * extent); + accumbuf_free = (char*)malloc(dsize); if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; } - accumbuf = accumbuf_free - lb; + accumbuf = accumbuf_free - gap; inbuf_free[0] = (char*)malloc(max_real_segsize); if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; } - inbuf[0] = inbuf_free[0] - lb; + inbuf[0] = inbuf_free[0] - gap; if (size > 2) { inbuf_free[1] = (char*)malloc(max_real_segsize); if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; } - inbuf[1] = inbuf_free[1] - lb; + inbuf[1] = inbuf_free[1] - gap; } /* Handle MPI_IN_PLACE for size > 1 */ diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter.c b/ompi/mca/coll/tuned/coll_tuned_scatter.c index c6125c905b..84729a6ec0 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter.c @@ -61,10 +61,9 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, char *ptmp, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; - MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + ptrdiff_t sextent, rextent, ssize, rsize, sgap, rgap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -76,10 +75,11 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); bmtree = data->cached_in_order_bmtree; - ompi_datatype_get_extent(sdtype, &slb, &sextent); - ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); - ompi_datatype_get_extent(rdtype, &rlb, &rextent); - ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); + ompi_datatype_type_extent(sdtype, &sextent); + ompi_datatype_type_extent(rdtype, &rextent); + + ssize = opal_datatype_span(&sdtype->super, scount * size, &sgap); + rsize = opal_datatype_span(&rdtype->super, rcount * size, &rgap); vrank = (rank - root + size) % size; ptmp = (char *) rbuf; /* by default suppose leaf nodes, just use rbuf */ @@ -96,12 +96,12 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, } } else { /* root is not on 0, allocate temp buffer for send */ - tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); + tempbuf = (char *) malloc(ssize); if (NULL == tempbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - slb; + ptmp = tempbuf - sgap; /* and rotate data so they will eventually in the right place */ err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root), @@ -124,12 +124,12 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, } else if (!(vrank % 2)) { /* non-root, non-leaf nodes, allocte temp buffer for recv * the most we need is rcount*size/2 */ - tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); + tempbuf = (char *) malloc(rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - rlb; + ptmp = tempbuf - rgap; sdtype = rdtype; scount = rcount; @@ -219,7 +219,7 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, mca_coll_base_module_t *module) { int i, rank, size, err; - ptrdiff_t lb, incr; + ptrdiff_t incr; char *ptmp; /* Initialize */ @@ -238,7 +238,7 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, /* I am the root, loop sending data. */ - err = ompi_datatype_get_extent(sdtype, &lb, &incr); + err = ompi_datatype_type_extent(sdtype, &incr); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index cf00a690c5..25f014ead0 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2010 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -329,6 +329,25 @@ OPAL_DECLSPEC opal_datatype_t* opal_datatype_create_from_packed_description( void** packed_buffer, struct opal_proc_t* remote_processor ); +/* Compute the span in memory of count datatypes. This function help with temporary + * memory allocations for receiving already typed data (such as those used for reduce + * operations). This span is the distance between the minimum and the maximum byte + * in the memory layout of count datatypes, or in other terms the memory needed to + * allocate count times the datatype without the gap in the beginning and at the end. + * + * Returns: the memory span of count repetition of the datatype, and in the gap + * argument, the number of bytes of the gap at the beginning. + */ +static inline OPAL_PTRDIFF_TYPE +opal_datatype_span( const opal_datatype_t* pData, int64_t count, + OPAL_PTRDIFF_TYPE* gap) +{ + OPAL_PTRDIFF_TYPE extent = (pData->ub - pData->lb); + OPAL_PTRDIFF_TYPE true_extent = (pData->true_ub - pData->true_lb); + *gap = pData->true_lb; + return true_extent + (count - 1) * extent; +} + #if OPAL_ENABLE_DEBUG /* * Set a breakpoint to this function in your favorite debugger From 3d570ed51d10d57824a7dc5099703bcc4723f434 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 22 Nov 2015 16:25:11 -0600 Subject: [PATCH 02/11] Patch submitted by @ggouaillardet on ticket #1091. (cherry picked from commit open-mpi/ompi@688108cf7fbcce8eb093e58bd5f48408c12c80b0) --- ompi/mca/coll/basic/coll_basic_scan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ompi/mca/coll/basic/coll_basic_scan.c b/ompi/mca/coll/basic/coll_basic_scan.c index c797bcee1e..93d66a7deb 100644 --- a/ompi/mca/coll/basic/coll_basic_scan.c +++ b/ompi/mca/coll/basic/coll_basic_scan.c @@ -73,6 +73,7 @@ mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, * receive into, later. */ dsize = opal_datatype_span(&dtype->super, count, &gap); + free_buffer = malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } From 2207713d35f852acda619ac45793252491fb29ac Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 8 Jun 2016 16:48:00 +0900 Subject: [PATCH 03/11] coll/base: fix [all]reduce with non zero lower bound datatypes Offset temporary buffer when a non zero lower bound datatype is used. Thanks Hristo Iliev for the report (back-ported from commit open-mpi/ompi@0e393195d9f2373ffa9d59a240092f643117cd39) --- ompi/mca/coll/tuned/coll_tuned_allreduce.c | 11 +++++++---- ompi/mca/coll/tuned/coll_tuned_reduce.c | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce.c b/ompi/mca/coll/tuned/coll_tuned_allreduce.c index 050617260d..14dfd8dd45 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce.c @@ -14,6 +14,8 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -151,7 +153,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; - char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; + char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf_free = NULL, *inplacebuf; ompi_request_t *reqs[2] = {NULL, NULL}; OPAL_PTRDIFF_TYPE span, gap; @@ -172,8 +174,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf = (char*) malloc(span); - if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } + inplacebuf_free = (char*) malloc(span); + if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } + inplacebuf = inplacebuf_free - gap; if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); @@ -280,7 +283,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - if (NULL != inplacebuf) free(inplacebuf); + if (NULL != inplacebuf_free) free(inplacebuf_free); return MPI_SUCCESS; error_hndl: diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce.c b/ompi/mca/coll/tuned/coll_tuned_reduce.c index 2764c1ca7f..349927d557 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce.c @@ -77,7 +77,6 @@ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_c char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, size, gap, segment_increment; - size_t typelng; ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; @@ -508,6 +507,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, { int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; + char *tmpbuf_free = NULL; size_t typelng; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -538,24 +538,26 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t dsize, gap; - char *tmpbuf = NULL; + char *tmpbuf; dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { - tmpbuf = (char *) malloc(dsize); - if (NULL == tmpbuf) { + tmpbuf_free = (char *) malloc(dsize); + if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } + tmpbuf = tmpbuf_free - gap; ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { - tmpbuf = (char *) malloc(dsize); - if (NULL == tmpbuf) { + tmpbuf_free = (char *) malloc(dsize); + if (NULL == tmpbuf_free) { return MPI_ERR_INTERN; } + tmpbuf = tmpbuf_free - gap; use_this_recvbuf = tmpbuf; } } @@ -575,9 +577,6 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { return ret; } - if (MPI_IN_PLACE == sendbuf) { - free(use_this_sendbuf); - } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ @@ -585,9 +584,11 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } - free(use_this_recvbuf); } } + if (NULL != tmpbuf_free) { + free(tmpbuf_free); + } return MPI_SUCCESS; } From 400ac5d36d91754ae319803d523cc4a8452bf679 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 5 Jul 2016 13:30:30 +0900 Subject: [PATCH 04/11] coll/base: fix non zero lower bound ddt handling in ompi_coll_base_reduce_intra_basic_linear() Thanks Yuki Matsumoto for the report (back-ported from commit open-mpi/ompi@c06fb04a9acec28a392d9ddb076c86174b0b060b) --- ompi/mca/coll/tuned/coll_tuned_reduce.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce.c b/ompi/mca/coll/tuned/coll_tuned_reduce.c index 349927d557..7588bbdc3d 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce.c @@ -623,7 +623,7 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, int i, rank, err, size; ptrdiff_t extent, dsize, gap; char *free_buffer = NULL, *pml_buffer = NULL; - char *inplace_temp = NULL, *inbuf; + char *inplace_temp_free = NULL, *inbuf; /* Initialize */ @@ -650,11 +650,11 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; - inplace_temp = (char*)malloc(dsize); - if (NULL == inplace_temp) { + inplace_temp_free = (char*)malloc(dsize); + if (NULL == inplace_temp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = inplace_temp - gap; + rbuf = inplace_temp_free - gap; } if (size > 1) { @@ -700,15 +700,14 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, ompi_op_reduce(op, inbuf, rbuf, count, dtype); } - if (NULL != inplace_temp) { - err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, - inplace_temp); + if (NULL != inplace_temp_free) { + err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, rbuf); } err = MPI_SUCCESS; exit: - if (NULL != inplace_temp) { - free(inplace_temp); + if (NULL != inplace_temp_free) { + free(inplace_temp_free); } if (NULL != free_buffer) { free(free_buffer); From ca9352896442f8a30664953bcf021ef37f68b22a Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 5 Jul 2016 13:32:36 +0900 Subject: [PATCH 05/11] coll/basic: fix non standard ddt handling - correctly handle non zero lower bound ddt - correctly handle ddt with size > extent Thanks Yuki Matsumoto for the report (back-ported from commit open-mpi/ompi@488d037d51e3973a7db64b127940dd20792d3299) --- ompi/mca/coll/basic/coll_basic_allgather.c | 39 +++++++++---------- .../coll/basic/coll_basic_reduce_scatter.c | 31 ++++++++------- .../basic/coll_basic_reduce_scatter_block.c | 36 +++++++++-------- 3 files changed, 55 insertions(+), 51 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic_allgather.c b/ompi/mca/coll/basic/coll_basic_allgather.c index 06d58cfe69..3bd0560c5b 100644 --- a/ompi/mca/coll/basic/coll_basic_allgather.c +++ b/ompi/mca/coll/basic/coll_basic_allgather.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -91,9 +91,10 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int rank, root = 0, size, rsize, err, i; - char *tmpbuf = NULL, *ptmp; - ptrdiff_t rlb, slb, rextent, sextent, incr; + int rank, root = 0, size, rsize, i, err; + char *tmpbuf_free = NULL, *tmpbuf, *ptmp; + ptrdiff_t rlb, rextent, incr; + ptrdiff_t gap, span; ompi_request_t *req; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; ompi_request_t **reqs = basic_module->mccb_reqs; @@ -116,17 +117,13 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { - return err; + goto exit; } } else { /* receive a msg. from all other procs. */ err = ompi_datatype_get_extent(rdtype, &rlb, &rextent); if (OMPI_SUCCESS != err) { - return err; - } - err = ompi_datatype_get_extent(sdtype, &slb, &sextent); - if (OMPI_SUCCESS != err) { - return err; + goto exit; } /* Do a send-recv between the two root procs. to avoid deadlock */ @@ -135,14 +132,14 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[rsize])); if (OMPI_SUCCESS != err) { - return err; + goto exit; } err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[0])); if (OMPI_SUCCESS != err) { - return err; + goto exit; } incr = rextent * rcount; @@ -152,20 +149,22 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[i])); if (MPI_SUCCESS != err) { - return err; + goto exit; } } err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { - return err; + goto exit; } - /* Step 2: exchange the resuts between the root processes */ - tmpbuf = (char *) malloc(scount * size * sextent); - if (NULL == tmpbuf) { - return err; + span = opal_datatype_span(&sdtype->super, scount * size, &gap); + tmpbuf_free = (char *) malloc(span); + if (NULL == tmpbuf_free) { + err = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; } + tmpbuf = tmpbuf_free - gap; err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, @@ -222,8 +221,8 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, } exit: - if (NULL != tmpbuf) { - free(tmpbuf); + if (NULL != tmpbuf_free) { + free(tmpbuf_free); } return err; diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c index d64abda3b2..333567e4d1 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c @@ -14,7 +14,7 @@ * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -367,8 +367,9 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, { int err, i, rank, root = 0, rsize, lsize; int totalcounts; - ptrdiff_t lb, extent; + ptrdiff_t gap, span; char *tmpbuf = NULL, *tmpbuf2 = NULL; + char *lbuf, *buf; ompi_request_t *req; int *disps = NULL; @@ -399,10 +400,7 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, * its size is the same as the local communicator size. */ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } + span = opal_datatype_span(&dtype->super, totalcounts, &gap); /* Generate displacements for the scatterv part */ disps = (int*) malloc(sizeof(int) * lsize); @@ -414,12 +412,14 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, disps[i + 1] = disps[i] + rcounts[i]; } - tmpbuf = (char *) malloc(totalcounts * extent); - tmpbuf2 = (char *) malloc(totalcounts * extent); + tmpbuf = (char *) malloc(span); + tmpbuf2 = (char *) malloc(span); if (NULL == tmpbuf || NULL == tmpbuf2) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } + lbuf = tmpbuf - gap; + buf = tmpbuf2 - gap; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0, @@ -429,7 +429,7 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, goto exit; } - err = MCA_PML_CALL(recv(tmpbuf2, totalcounts, dtype, 0, + err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { @@ -443,11 +443,12 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, /* Loop receiving and calling reduction function (C or Fortran) - * The result of this reduction operations is then in - * tmpbuf2. + * The result of this reduction operations is then in + * lbuf. */ for (i = 1; i < rsize; i++) { - err = MCA_PML_CALL(recv(tmpbuf, totalcounts, dtype, i, + char *tbuf; + err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { @@ -455,7 +456,9 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, } /* Perform the reduction */ - ompi_op_reduce(op, tmpbuf, tmpbuf2, totalcounts, dtype); + ompi_op_reduce(op, lbuf, buf, totalcounts, dtype); + /* swap the buffers */ + tbuf = lbuf; lbuf = buf; buf = tbuf; } } else { /* If not root, send data to the root. */ @@ -468,7 +471,7 @@ mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts, } /* Now do a scatterv on the local communicator */ - err = comm->c_local_comm->c_coll.coll_scatterv(tmpbuf2, rcounts, disps, dtype, + err = comm->c_local_comm->c_coll.coll_scatterv(lbuf, rcounts, disps, dtype, rbuf, rcounts[rank], dtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatterv_module); diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index 9e6854bfb9..ce45bb4ac1 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -12,7 +12,7 @@ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * @@ -58,7 +58,7 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, mca_coll_base_module_t *module) { int rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t extent, buf_size, gap; + ptrdiff_t gap, span; char *recv_buf = NULL, *recv_buf_free = NULL; /* Initialize */ @@ -72,8 +72,7 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, } /* get datatype information */ - ompi_datatype_type_extent(dtype, &extent); - buf_size = opal_datatype_span(&dtype->super, count, &gap); + span = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -83,12 +82,12 @@ mca_coll_basic_reduce_scatter_block_intra(void *sbuf, void *rbuf, int rcount, if (0 == rank) { /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ - recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - gap; + recv_buf_free = (char*) malloc(span); if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } + recv_buf = recv_buf_free - gap; } /* reduction */ @@ -126,8 +125,9 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, { int err, i, rank, root = 0, rsize, lsize; int totalcounts; - ptrdiff_t lb, extent; + ptrdiff_t gap, span; char *tmpbuf = NULL, *tmpbuf2 = NULL; + char *lbuf, *buf; ompi_request_t *req; rank = ompi_comm_rank(comm); @@ -151,16 +151,15 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, * */ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } + span = opal_datatype_span(&dtype->super, totalcounts, &gap); - tmpbuf = (char *) malloc(totalcounts * extent); - tmpbuf2 = (char *) malloc(totalcounts * extent); + tmpbuf = (char *) malloc(span); + tmpbuf2 = (char *) malloc(span); if (NULL == tmpbuf || NULL == tmpbuf2) { return OMPI_ERR_OUT_OF_RESOURCE; } + lbuf = tmpbuf - gap; + buf = tmpbuf2 - gap; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0, @@ -170,7 +169,7 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, goto exit; } - err = MCA_PML_CALL(recv(tmpbuf2, totalcounts, dtype, 0, + err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { @@ -188,7 +187,8 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, * tmpbuf2. */ for (i = 1; i < rsize; i++) { - err = MCA_PML_CALL(recv(tmpbuf, totalcounts, dtype, i, + char *tbuf; + err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { @@ -196,7 +196,9 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, } /* Perform the reduction */ - ompi_op_reduce(op, tmpbuf, tmpbuf2, totalcounts, dtype); + ompi_op_reduce(op, lbuf, buf, totalcounts, dtype); + /* swap the buffers */ + tbuf = lbuf; lbuf = buf; buf = tbuf; } } else { /* If not root, send data to the root. */ @@ -209,7 +211,7 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, } /* Now do a scatterv on the local communicator */ - err = comm->c_local_comm->c_coll.coll_scatter(tmpbuf2, rcount, dtype, + err = comm->c_local_comm->c_coll.coll_scatter(lbuf, rcount, dtype, rbuf, rcount, dtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatter_module); From a7a9ae8787897e2029c4e65463131f609378a1f6 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 5 Jul 2016 13:36:56 +0900 Subject: [PATCH 06/11] coll/inter: fix non standard ddt handling - correctly handle non zero lower bound ddt - correctly handle ddt with size > extent Thanks Yuki Matsumoto for the report (back-ported from commit open-mpi/ompi@3e559a14a99ac737d501818eca6a2d1acfdbeb42) --- ompi/mca/coll/inter/coll_inter_allgather.c | 22 ++++++------- ompi/mca/coll/inter/coll_inter_allgatherv.c | 35 ++++++++------------ ompi/mca/coll/inter/coll_inter_allreduce.c | 19 ++++++----- ompi/mca/coll/inter/coll_inter_gather.c | 35 +++++++++----------- ompi/mca/coll/inter/coll_inter_gatherv.c | 31 +++++++----------- ompi/mca/coll/inter/coll_inter_reduce.c | 16 +++++---- ompi/mca/coll/inter/coll_inter_scatter.c | 28 ++++++++-------- ompi/mca/coll/inter/coll_inter_scatterv.c | 36 ++++++++------------- 8 files changed, 98 insertions(+), 124 deletions(-) diff --git a/ompi/mca/coll/inter/coll_inter_allgather.c b/ompi/mca/coll/inter/coll_inter_allgather.c index a8845d1ca1..4454b4534c 100644 --- a/ompi/mca/coll/inter/coll_inter_allgather.c +++ b/ompi/mca/coll/inter/coll_inter_allgather.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -47,8 +49,8 @@ mca_coll_inter_allgather_inter(void *sbuf, int scount, mca_coll_base_module_t *module) { int rank, root = 0, size, rsize, err; - char *ptmp = NULL; - ptrdiff_t slb, sextent, incr; + char *ptmp_free = NULL, *ptmp; + ptrdiff_t gap, span; ompi_request_t *req[2]; rank = ompi_comm_rank(comm); @@ -56,17 +58,13 @@ mca_coll_inter_allgather_inter(void *sbuf, int scount, rsize = ompi_comm_remote_size(comm); /* Perform the gather locally at the root */ - err = ompi_datatype_get_extent(sdtype, &slb, &sextent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - if ( scount > 0 ) { - incr = sextent * scount; - ptmp = (char*)malloc(size * incr); - if (NULL == ptmp) { + span = opal_datatype_span(&sdtype->super, scount*size, &gap); + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } + ptmp = ptmp_free - gap; err = comm->c_local_comm->c_coll.coll_gather(sbuf, scount, sdtype, ptmp, scount, sdtype, @@ -110,8 +108,8 @@ mca_coll_inter_allgather_inter(void *sbuf, int scount, } exit: - if (NULL != ptmp) { - free(ptmp); + if (NULL != ptmp_free) { + free(ptmp_free); } return err; diff --git a/ompi/mca/coll/inter/coll_inter_allgatherv.c b/ompi/mca/coll/inter/coll_inter_allgatherv.c index aa4f6c1cd6..6e15c1befc 100644 --- a/ompi/mca/coll/inter/coll_inter_allgatherv.c +++ b/ompi/mca/coll/inter/coll_inter_allgatherv.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -47,10 +49,7 @@ mca_coll_inter_allgatherv_inter(void *sbuf, int scount, { int i, rank, size, size_local, total=0, err; int *count=NULL,*displace=NULL; - char *ptmp=NULL; - MPI_Aint incr; - MPI_Aint extent; - MPI_Aint lb; + char *ptmp_free=NULL, *ptmp; ompi_datatype_t *ndtype = NULL; ompi_request_t *req[2]; @@ -79,22 +78,19 @@ mca_coll_inter_allgatherv_inter(void *sbuf, int scount, for (i = 1; i < size_local; i++) { displace[i] = displace[i-1] + count[i-1]; } - /* Perform the gatherv locally with the first process as root */ - err = ompi_datatype_get_extent(sdtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - err = OMPI_ERROR; - goto exit; - } - incr = 0; + total = 0; for (i = 0; i < size_local; i++) { - incr = incr + extent*count[i]; + total = total + count[i]; } - if ( incr > 0 ) { - ptmp = (char*)malloc(incr); - if (NULL == ptmp) { + if ( total > 0 ) { + ptrdiff_t gap, span; + span = opal_datatype_span(&sdtype->super, total, &gap); + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } + ptmp = ptmp_free - gap; } } err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype, @@ -108,10 +104,7 @@ mca_coll_inter_allgatherv_inter(void *sbuf, int scount, ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&ndtype); ompi_datatype_commit(&ndtype); - if (0 == rank) { - for (i = 0; i < size_local; i++) { - total = total + count[i]; - } + if (0 == rank) { /* Exchange data between roots */ err = MCA_PML_CALL(irecv(rbuf, 1, ndtype, 0, MCA_COLL_BASE_TAG_ALLGATHERV, comm, @@ -142,8 +135,8 @@ mca_coll_inter_allgatherv_inter(void *sbuf, int scount, if( NULL != ndtype ) { ompi_datatype_destroy(&ndtype); } - if (NULL != ptmp) { - free(ptmp); + if (NULL != ptmp_free) { + free(ptmp_free); } if (NULL != displace) { free(displace); diff --git a/ompi/mca/coll/inter/coll_inter_allreduce.c b/ompi/mca/coll/inter/coll_inter_allreduce.c index 405ff3ce58..cc5539f06d 100644 --- a/ompi/mca/coll/inter/coll_inter_allreduce.c +++ b/ompi/mca/coll/inter/coll_inter_allreduce.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,25 +46,22 @@ mca_coll_inter_allreduce_inter(void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int err, root = 0, rank; - ptrdiff_t lb, extent; + int err, rank, root = 0; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t *req[2]; + ptrdiff_t gap, span; rank = ompi_comm_rank(comm); /* Perform the reduction locally */ - err = ompi_datatype_get_extent(dtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - - tmpbuf = (char *) malloc(count * extent); + span = opal_datatype_span(&dtype->super, count, &gap); + + tmpbuf = (char *) malloc(span); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = tmpbuf - lb; - + pml_buffer = tmpbuf - gap; + err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, root, comm->c_local_comm, diff --git a/ompi/mca/coll/inter/coll_inter_gather.c b/ompi/mca/coll/inter/coll_inter_gather.c index 958cf24351..177a1278aa 100644 --- a/ompi/mca/coll/inter/coll_inter_gather.c +++ b/ompi/mca/coll/inter/coll_inter_gather.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -45,11 +47,7 @@ mca_coll_inter_gather_inter(void *sbuf, int scount, { int err; int rank; - int size,size_local; - char *ptmp = NULL; - MPI_Aint incr; - MPI_Aint extent; - MPI_Aint lb; + int size; size = ompi_comm_remote_size(comm); rank = ompi_comm_rank(comm); @@ -59,20 +57,21 @@ mca_coll_inter_gather_inter(void *sbuf, int scount, err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Perform the gather locally with the first process as root */ - err = ompi_datatype_get_extent(sdtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - - incr = extent * scount; + char *ptmp_free = NULL, *ptmp; + int size_local; + ptrdiff_t gap, span; + size_local = ompi_comm_size(comm->c_local_comm); - ptmp = (char*)malloc(size_local * incr); - if (NULL == ptmp) { + span = opal_datatype_span(&sdtype->super, scount*size_local, &gap); + + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } - - err = comm->c_local_comm->c_coll.coll_gather(sbuf, scount, sdtype, - ptmp, scount, sdtype, + ptmp = ptmp_free - gap; + + err = comm->c_local_comm->c_coll.coll_gather(sbuf, scount, sdtype, + ptmp, scount, sdtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_gather_module); if (0 == rank) { @@ -84,9 +83,7 @@ mca_coll_inter_gather_inter(void *sbuf, int scount, return err; } } - if (NULL != ptmp) { - free(ptmp); - } + free(ptmp_free); } else { /* I am the root, loop receiving the data. */ err = MCA_PML_CALL(recv(rbuf, rcount*size, rdtype, 0, diff --git a/ompi/mca/coll/inter/coll_inter_gatherv.c b/ompi/mca/coll/inter/coll_inter_gatherv.c index d2339e1634..9aa0720541 100644 --- a/ompi/mca/coll/inter/coll_inter_gatherv.c +++ b/ompi/mca/coll/inter/coll_inter_gatherv.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,10 +46,7 @@ mca_coll_inter_gatherv_inter(void *sbuf, int scount, { int i, rank, size, size_local, total=0, err; int *count=NULL, *displace=NULL; - char *ptmp=NULL; - MPI_Aint incr; - MPI_Aint extent; - MPI_Aint lb; + char *ptmp_free=NULL, *ptmp; ompi_datatype_t *ndtype; if (MPI_PROC_NULL == root) { /* do nothing */ @@ -90,21 +89,18 @@ mca_coll_inter_gatherv_inter(void *sbuf, int scount, displace[i] = displace[i-1] + count[i-1]; } /* Perform the gatherv locally with the first process as root */ - err = ompi_datatype_get_extent(sdtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - err = OMPI_ERROR; - goto exit; - } - incr = 0; for (i = 0; i < size_local; i++) { - incr = incr + extent*count[i]; + total = total + count[i]; } - if ( incr > 0 ) { - ptmp = (char*)malloc(incr); - if (NULL == ptmp) { + if ( total > 0 ) { + ptrdiff_t gap, span; + span = opal_datatype_span(&sdtype->super, total, &gap); + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } + ptmp = ptmp_free - gap; } } err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype, @@ -116,9 +112,6 @@ mca_coll_inter_gatherv_inter(void *sbuf, int scount, } if (0 == rank) { - for (i = 0; i < size_local; i++) { - total = total + count[i]; - } /* First process sends data to the root */ err = MCA_PML_CALL(send(ptmp, total, sdtype, root, MCA_COLL_BASE_TAG_GATHERV, @@ -126,8 +119,8 @@ mca_coll_inter_gatherv_inter(void *sbuf, int scount, } exit: - if (NULL != ptmp) { - free(ptmp); + if (NULL != ptmp_free) { + free(ptmp_free); } if (NULL != displace) { free(displace); diff --git a/ompi/mca/coll/inter/coll_inter_reduce.c b/ompi/mca/coll/inter/coll_inter_reduce.c index 68fe57c97d..26e4164e10 100644 --- a/ompi/mca/coll/inter/coll_inter_reduce.c +++ b/ompi/mca/coll/inter/coll_inter_reduce.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,9 +46,6 @@ mca_coll_inter_reduce_inter(void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; - char *free_buffer = NULL; - char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); @@ -55,15 +54,18 @@ mca_coll_inter_reduce_inter(void *sbuf, void *rbuf, int count, /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { + ptrdiff_t gap, span; + char *free_buffer = NULL; + char *pml_buffer = NULL; + /* Perform the reduce locally with the first process as root */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + span = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(span); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - lb; + pml_buffer = free_buffer - gap; err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, 0, comm->c_local_comm, diff --git a/ompi/mca/coll/inter/coll_inter_scatter.c b/ompi/mca/coll/inter/coll_inter_scatter.c index 1350a69ed9..190f013b54 100644 --- a/ompi/mca/coll/inter/coll_inter_scatter.c +++ b/ompi/mca/coll/inter/coll_inter_scatter.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,9 +44,7 @@ mca_coll_inter_scatter_inter(void *sbuf, int scount, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int rank, size, size_local, err; - char *ptmp = NULL; - ptrdiff_t lb, incr; + int rank, size, err; /* Initialize */ @@ -56,18 +56,18 @@ mca_coll_inter_scatter_inter(void *sbuf, int scount, err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* First process receives the data from root */ - if(0 == rank) { - err = ompi_datatype_get_extent(rdtype, &lb, &incr); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - - incr *= rcount; + char *ptmp_free = NULL, *ptmp; + if(0 == rank) { + int size_local; + ptrdiff_t gap, span; + size_local = ompi_comm_size(comm->c_local_comm); - ptmp = (char*)malloc(size_local * incr); - if (NULL == ptmp) { + span = opal_datatype_span(&rdtype->super, rcount*size_local, &gap); + ptmp_free = malloc(span); + if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } + ptmp = ptmp_free - gap; err = MCA_PML_CALL(recv(ptmp, rcount*size_local, rdtype, root, MCA_COLL_BASE_TAG_SCATTER, @@ -81,8 +81,8 @@ mca_coll_inter_scatter_inter(void *sbuf, int scount, rbuf, rcount, rdtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatter_module); - if (NULL != ptmp) { - free(ptmp); + if (NULL != ptmp_free) { + free(ptmp_free); } } else { /* Root sends data to the first process in the remote group */ diff --git a/ompi/mca/coll/inter/coll_inter_scatterv.c b/ompi/mca/coll/inter/coll_inter_scatterv.c index 26ad961e6f..6eaa133959 100644 --- a/ompi/mca/coll/inter/coll_inter_scatterv.c +++ b/ompi/mca/coll/inter/coll_inter_scatterv.c @@ -9,7 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,12 +44,9 @@ mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, rank, size, err, total, size_local; + int i, rank, size, err, total=0, size_local; int *counts=NULL,*displace=NULL; - char *ptmp=NULL; - MPI_Aint incr; - MPI_Aint extent; - MPI_Aint lb; + char *ptmp_free=NULL, *ptmp; ompi_datatype_t *ndtype; /* Initialize */ @@ -70,24 +68,18 @@ mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, if (OMPI_SUCCESS != err) { return err; } - /* calculate the whole buffer size and recieve it from root */ - err = ompi_datatype_get_extent(rdtype, &lb, &extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - incr = 0; + /* calculate the whole buffer size and receive it from root */ for (i = 0; i < size_local; i++) { - incr = incr + extent*counts[i]; + total = total + counts[i]; } - if ( incr > 0 ) { - ptmp = (char*)malloc(incr); - if (NULL == ptmp) { + if ( total > 0 ) { + ptrdiff_t gap, span; + span = opal_datatype_span(&rdtype->super, total, &gap); + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } - } - total = 0; - for (i = 0; i < size_local; i++) { - total = total + counts[i]; + ptmp = ptmp_free - gap; } err = MCA_PML_CALL(recv(ptmp, total, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, @@ -111,8 +103,8 @@ mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, return err; } - if (NULL != ptmp) { - free(ptmp); + if (NULL != ptmp_free) { + free(ptmp_free); } if (NULL != displace) { free(displace); From 4daa7417712c4d804623886b1ae97e4d0461c39c Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 6 Jul 2016 08:57:00 +0900 Subject: [PATCH 07/11] coll/libnbc: various fixes - correctly handle non commutative operators - correctly handle non zero lower bound ddt - correctly handle ddt with size > extent - revamp NBC_Sched_op so it takes two buffers and matches ompi_op_reduce semantic - various fix for inter communicators Thanks Yuki Matsumoto for the report (back-ported from commit open-mpi/ompi@678d08647b5943deff03bf2fb2cbe632d3e8275c) --- ompi/mca/coll/libnbc/Makefile.am | 4 +- ompi/mca/coll/libnbc/nbc.c | 111 +++++---- ompi/mca/coll/libnbc/nbc_iallreduce.c | 217 +++++++++++------ ompi/mca/coll/libnbc/nbc_iexscan.c | 55 ++--- ompi/mca/coll/libnbc/nbc_internal.h | 16 +- ompi/mca/coll/libnbc/nbc_ireduce.c | 220 ++++++++++++------ ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 151 +++++++----- ...er_block.c => nbc_ireduce_scatter_block.c} | 115 ++++----- ompi/mca/coll/libnbc/nbc_iscan.c | 35 ++- 9 files changed, 567 insertions(+), 357 deletions(-) rename ompi/mca/coll/libnbc/{coll_libnbc_ireduce_scatter_block.c => nbc_ireduce_scatter_block.c} (70%) diff --git a/ompi/mca/coll/libnbc/Makefile.am b/ompi/mca/coll/libnbc/Makefile.am index a39586a5ce..24b8d850a0 100644 --- a/ompi/mca/coll/libnbc/Makefile.am +++ b/ompi/mca/coll/libnbc/Makefile.am @@ -12,6 +12,8 @@ # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2013 Los Alamos National Security, LLC. All rights # reserved. +# Copyright (c) 2016 Research Organization for Information Science +# and Technology (RIST). All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -24,7 +26,6 @@ EXTRA_DIST = .windows sources = \ coll_libnbc.h \ coll_libnbc_component.c \ - coll_libnbc_ireduce_scatter_block.c \ nbc.c \ nbc_internal.h \ libdict/dict.h \ @@ -51,6 +52,7 @@ sources = \ nbc_ineighbor_alltoallw.c \ nbc_ireduce.c \ nbc_ireduce_scatter.c \ + nbc_ireduce_scatter_block.c \ nbc_iscan.c \ nbc_iscatter.c \ nbc_iscatterv.c \ diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index 1ecef4c264..4a3f5222b3 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -7,6 +7,10 @@ * reserved. * Copyright (c) 2006 The Technical University of Chemnitz. All * rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler * @@ -50,9 +54,9 @@ int NBC_Sched_create(NBC_Schedule* schedule) { } /* this function puts a send into the schedule */ -int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule) { +static int NBC_Sched_send_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule) { int size; - char* ptr; + char *ptr; NBC_Fn_type type = SEND; NBC_Args_send send_args; @@ -63,11 +67,12 @@ int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } /* store the passed arguments */ - send_args.buf=buf; - send_args.tmpbuf=tmpbuf; - send_args.count=count; - send_args.datatype=datatype; - send_args.dest=dest; + send_args.buf = buf; + send_args.tmpbuf = tmpbuf; + send_args.count = count; + send_args.datatype = datatype; + send_args.dest = dest; + send_args.local = local; /* append to the round-schedule */ ptr = (char*)*schedule + size; @@ -84,10 +89,18 @@ int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int return NBC_OK; } +int NBC_Sched_send (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule) { + return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, false, schedule); +} + +int NBC_Sched_local_send (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule) { + return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, true, schedule); +} + /* this function puts a receive into the schedule */ -int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule) { +static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule) { int size; - char* ptr; + char *ptr; NBC_Fn_type type = RECV; NBC_Args_recv recv_args; @@ -98,11 +111,12 @@ int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } /* store the passed arguments */ - recv_args.buf=buf; - recv_args.tmpbuf=tmpbuf; - recv_args.count=count; - recv_args.datatype=datatype; - recv_args.source=source; + recv_args.buf = buf; + recv_args.tmpbuf = tmpbuf; + recv_args.count = count; + recv_args.datatype = datatype; + recv_args.source = source; + recv_args.local = local; /* append to the round-schedule */ ptr = (char*)*schedule + size; @@ -119,10 +133,19 @@ int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int return NBC_OK; } +int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule) { + return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, false, schedule); +} + +int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule) { + return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, true, schedule); +} + /* this function puts an operation into the schedule */ -int NBC_Sched_op(void *buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule) { +int NBC_Sched_op (void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule) { int size; - char* ptr; + char *ptr; NBC_Fn_type type = OP; NBC_Args_op op_args; @@ -133,15 +156,13 @@ int NBC_Sched_op(void *buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, if(*schedule == NULL) { printf("Error in realloc()\n"); return NBC_OOR; } /* store the passed arguments */ - op_args.buf1=buf1; - op_args.buf2=buf2; - op_args.buf3=buf3; - op_args.tmpbuf1=tmpbuf1; - op_args.tmpbuf2=tmpbuf2; - op_args.tmpbuf3=tmpbuf3; - op_args.count=count; - op_args.op=op; - op_args.datatype=datatype; + op_args.buf1 = buf1; + op_args.buf2 = buf2; + op_args.tmpbuf1 = tmpbuf1; + op_args.tmpbuf2 = tmpbuf2; + op_args.count = count; + op_args.op = op; + op_args.datatype = datatype; /* append to the round-schedule */ ptr = (char*)*schedule + size; @@ -379,13 +400,13 @@ static inline int NBC_Start_round(NBC_Handle *handle) { int i, res, ret=NBC_OK; char* ptr; NBC_Fn_type type; - NBC_Args_send sendargs; - NBC_Args_recv recvargs; - NBC_Args_op opargs; - NBC_Args_copy copyargs; - NBC_Args_unpack unpackargs; + NBC_Args_send sendargs; + NBC_Args_recv recvargs; + NBC_Args_op opargs; + NBC_Args_copy copyargs; + NBC_Args_unpack unpackargs; NBC_Schedule myschedule; - void *buf1, *buf2, *buf3; + void *buf1, *buf2; /* get round-schedule address */ myschedule = (NBC_Schedule*)((char*)*handle->schedule + handle->row_offset); @@ -412,10 +433,12 @@ static inline int NBC_Start_round(NBC_Handle *handle) { #ifdef NBC_TIMING Isend_time -= MPI_Wtime(); #endif - handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request)); + handle->req_array = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request)); NBC_CHECK_NULL(handle->req_array); - res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, handle->comm, handle->req_array+handle->req_count-1)); - if(OMPI_SUCCESS != res) { printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; } + res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, + MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm, + handle->req_array+handle->req_count - 1)); + if (OMPI_SUCCESS != res) { printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; } #ifdef NBC_TIMING Isend_time += MPI_Wtime(); #endif @@ -435,18 +458,21 @@ static inline int NBC_Start_round(NBC_Handle *handle) { #ifdef NBC_TIMING Irecv_time -= MPI_Wtime(); #endif - handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request)); + handle->req_array = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request)); NBC_CHECK_NULL(handle->req_array); - res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, handle->comm, handle->req_array+handle->req_count-1)); - if(OMPI_SUCCESS != res) { printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; } + + res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm, + handle->req_array+handle->req_count-1)); + if (OMPI_SUCCESS != res) { printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; } #ifdef NBC_TIMING Irecv_time += MPI_Wtime(); #endif break; case OP: - NBC_DEBUG(5, " OP (offset %li) ", (long)ptr-(long)myschedule); + NBC_DEBUG(5, " OP2 (offset %li) ", (long)ptr - (long)myschedule); NBC_GET_BYTES(ptr,opargs); - NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %lu)\n", opargs.buf1, opargs.buf2, opargs.count, (unsigned long)opargs.datatype); + NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, + opargs.count, opargs.datatype); /* get buffers */ if(opargs.tmpbuf1) { buf1=(char*)handle->tmpbuf+(long)opargs.buf1; @@ -458,12 +484,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) { } else { buf2=opargs.buf2; } - if(opargs.tmpbuf3) { - buf3=(char*)handle->tmpbuf+(long)opargs.buf3; - } else { - buf3=opargs.buf3; - } - ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype); + ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype); break; case COPY: NBC_DEBUG(5, " COPY (offset %li) ", (long)ptr-(long)myschedule); diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index 6455fbac30..e9c37568d9 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -6,6 +6,8 @@ * rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2014-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler * @@ -13,12 +15,18 @@ #include "nbc_internal.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" #include -static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); -static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle); -static inline int allred_sched_linear(int rank, int p, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, void *sendbuf, + void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, + void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, + NBC_Handle *handle); +static inline int allred_sched_linear(int rank, int p, void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, int ext, int size, + NBC_Schedule *schedule, NBC_Handle *handle); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -54,6 +62,7 @@ int ompi_coll_libnbc_iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Dat NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -68,8 +77,9 @@ int ompi_coll_libnbc_iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Dat res = ompi_datatype_type_size (datatype, &size); if (OMPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } - handle->tmpbuf = malloc(ext*count); - if(handle->tmpbuf == NULL) { printf("Error in malloc() (%i)\n", res); return NBC_OOR; } + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); + if (NULL == handle->tmpbuf) { printf("Error in malloc() (%i)\n", res); return NBC_OOR; } if((p == 1) && !inplace) { /* for a single node - copy data to receivebuf */ @@ -78,7 +88,7 @@ int ompi_coll_libnbc_iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Dat } /* algorithm selection */ - if(p < 4 || size*count < 65536 || inplace) { + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { alg = NBC_ARED_BINOMIAL; } else { alg = NBC_ARED_RING; @@ -102,7 +112,7 @@ int ompi_coll_libnbc_iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Dat switch(alg) { case NBC_ARED_BINOMIAL: - res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); + res = allred_sched_diss(rank, p, count, datatype, gap, sendbuf, recvbuf, op, inplace, schedule, handle); break; case NBC_ARED_RING: res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); @@ -151,6 +161,7 @@ int ompi_coll_libnbc_iallreduce_inter(void* sendbuf, void* recvbuf, int count, M NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } @@ -164,7 +175,8 @@ int ompi_coll_libnbc_iallreduce_inter(void* sendbuf, void* recvbuf, int count, M res = MPI_Type_size(datatype, &size); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_size() (%i)\n", res); return res; } - handle->tmpbuf = malloc(ext*count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if(handle->tmpbuf == NULL) { printf("Error in malloc() (%i)\n", res); return NBC_OOR; } schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); @@ -173,7 +185,8 @@ int ompi_coll_libnbc_iallreduce_inter(void* sendbuf, void* recvbuf, int count, M res = NBC_Sched_create(schedule); if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } - res = allred_sched_linear(rank, rsize, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle); + res = allred_sched_linear (rank, rsize, sendbuf, recvbuf, count, datatype, gap, op, + ext, size, schedule, handle); if (NBC_OK != res) { printf("Error in Schedule creation() (%i)\n", res); return res; } res = NBC_Sched_commit(schedule); @@ -222,12 +235,31 @@ int ompi_coll_libnbc_iallreduce_inter(void* sendbuf, void* recvbuf, int count, M if (vrank == 0) rank = root; \ if (vrank == root) rank = 0; \ } -static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, void *sendbuf, void *recvbuf, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, r, maxr, firstred, vpeer, peer, res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); - maxr = (int)ceil((log(p)/LOG2)); + maxr = (int)ceil((log((double)p)/LOG2)); + /* ensure the result ends up in recvbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } else { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf) - gap, count, datatype, MPI_COMM_SELF); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Copy() (%i)\n", res); return res; } + } + } firstred = 1; for(r=1; r<=maxr; r++) { @@ -236,34 +268,39 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat vpeer = vrank + (1<<(r-1)); VRANK2RANK(peer, vpeer, root) if(peertmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - if(firstred && MPI_IN_PLACE != sendbuf) { + /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ + if (firstred && !inplace) { /* perform the reduce with the senbuf */ - res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule); } if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ vpeer = vrank - (1<<(r-1)); VRANK2RANK(peer, vpeer, root) - if(firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); } else { - /* and the recvbuf in all remeining rounds */ - res = NBC_Sched_send(recvbuf, false, count, datatype, peer, schedule); + /* and the recvbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule); } if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } /* leave the game */ @@ -288,6 +325,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat if(NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } + if (0 == vrank) assert(lbuf == recvbuf); /* now send to the right hosts */ for(r=0; rtmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); break; } + + res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer, schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); break; } + + res = NBC_Sched_barrier(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); break; } + + res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false, + segsizes[relement], datatype, op, schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); break; } + + res = NBC_Sched_barrier(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); break; } } - return NBC_OK; + if (NBC_OK != res) { + free (segsizes); + free (segoffsets); + return res; + } + + for (int round = p - 1 ; round < 2 * p - 2 ; ++round) { + int selement = (r+1-round + 2*p /*2*p avoids negative mod*/)%p; /* the element I am sending */ + int soffset = segoffsets[selement]*ext; + int relement = (r-round + 2*p /*2*p avoids negative mod*/)%p; /* the element that I receive from my neighbor */ + int roffset = segoffsets[relement]*ext; + + res = NBC_Sched_send ((char *) recvbuf + soffset, false, segsizes[selement], datatype, speer, schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); break; } + + res = NBC_Sched_recv ((char *) recvbuf + roffset, false, segsizes[relement], datatype, rpeer, schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); break; } + + res = NBC_Sched_barrier(schedule); + if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); break; } + } + + free (segsizes); + free (segoffsets); + + return res; } static inline int allred_sched_linear(int rank, int rsize, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { - int res, rpeer; + ptrdiff_t gap, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { + int res; if(count == 0) return NBC_OK; @@ -480,31 +535,59 @@ static inline int allred_sched_linear(int rank, int rsize, void *sendbuf, void * res = NBC_Sched_send (sendbuf, false, count, datatype, 0, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* recv my data to the remote root */ + if (0 != rank || 1 ==(rsize%2)) { + res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule); + } else { + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } if (0 == rank) { - /* wait for data from the remote root */ + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + res = NBC_Sched_barrier (schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } + /* get data from remote peers and reduce */ - for (rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_recv (0, true, count, datatype, rpeer, schedule); + for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, rpeer, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } /* exchange our result with the remote root (each root will broadcast to the other's peers) */ - res = NBC_Sched_recv (0, true, count, datatype, 0, schedule); + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_send (recvbuf, false, count, datatype, 0, schedule); @@ -515,8 +598,8 @@ static inline int allred_sched_linear(int rank, int rsize, void *sendbuf, void * if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* broadcast the result to all remote peers */ - for (rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_send (0, true, count, datatype, rpeer, schedule); + for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rpeer, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } } } diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index b7c6a4aa77..90383a6aeb 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -42,7 +42,7 @@ int ompi_coll_libnbc_iexscan(void* sendbuf, void* recvbuf, int count, MPI_Dataty struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_0_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -54,25 +54,24 @@ int ompi_coll_libnbc_iexscan(void* sendbuf, void* recvbuf, int count, MPI_Dataty NBC_IN_PLACE(sendbuf, recvbuf, inplace); + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + res = NBC_Init_handle(comm, coll_req, libnbc_module); if (res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_size(comm, &p); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } - res = MPI_Type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } - - if (inplace && rank < p - 1) - /* need more buffer space for the inplace case */ - handle->tmpbuf = malloc(ext * count * 2); - else - handle->tmpbuf = malloc(ext * count); - - if (handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + span = opal_datatype_span(&datatype->super, count, &gap); + if (0 < rank) { + handle->tmpbuf = malloc(span); + if (handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + if (inplace) { + NBC_Copy(recvbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + } else { + NBC_Copy(sendbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + } + } #ifdef NBC_CACHE_SCHEDULE fprintf (stderr, "NBC_CACHE_SCHEDULE\n"); @@ -92,12 +91,7 @@ int ompi_coll_libnbc_iexscan(void* sendbuf, void* recvbuf, int count, MPI_Dataty if (res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } if (rank != 0) { - if (inplace && rank < p - 1) - /* if sendbuf == recvbuf do not clobber the send buffer until it has been combined - * with the incoming data. */ - res = NBC_Sched_recv((void *)(ext * count), true, count, datatype, rank-1, schedule); - else - res = NBC_Sched_recv(recvbuf, false, count, datatype, rank-1, schedule); + res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } @@ -106,27 +100,22 @@ int ompi_coll_libnbc_iexscan(void* sendbuf, void* recvbuf, int count, MPI_Dataty res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - /* perform the reduce in my temporary buffer */ - if (inplace) - res = NBC_Sched_op(0, true, sendbuf, false, (void *)(ext * count), true, count, datatype, op, schedule); - else - res = NBC_Sched_op(0, true, sendbuf, false, recvbuf, false, count, datatype, op, schedule); + res = NBC_Sched_op (recvbuf, false, (void *)(-gap), true, count, datatype, op, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } - /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* send reduced data onward */ - res = NBC_Sched_send(0, true, count, datatype, rank + 1, schedule); + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } - - if (inplace) - /* copy the received data into the receive buffer */ - NBC_Sched_copy ((void *)(ext * count), true, count, datatype, recvbuf, false, count, datatype, schedule); } } else if (p > 1) { - res = NBC_Sched_send(sendbuf, false, count, datatype, 1, schedule); + if (inplace) { + res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule); + } else { + res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule); + } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } } diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index 81be8ccff7..18f56591be 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -9,6 +9,10 @@ * * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * */ #ifndef __NBC_INTERNAL_H__ @@ -28,6 +32,7 @@ #include "ompi/include/ompi/constants.h" #include "ompi/request/request.h" #include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" #include #include @@ -85,6 +90,7 @@ typedef struct { int count; MPI_Datatype datatype; int dest; + bool local; } NBC_Args_send; /* the receive argument struct */ @@ -94,6 +100,7 @@ typedef struct { int count; MPI_Datatype datatype; int source; + bool local; } NBC_Args_recv; /* the operation argument struct */ @@ -102,11 +109,9 @@ typedef struct { char tmpbuf1; void *buf2; char tmpbuf2; - void *buf3; - char tmpbuf3; - int count; MPI_Op op; MPI_Datatype datatype; + int count; } NBC_Args_op; /* the copy argument struct */ @@ -134,8 +139,11 @@ typedef struct { /* internal function prototypes */ int NBC_Sched_create(NBC_Schedule* schedule); int NBC_Sched_send(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule); +int NBC_Sched_local_send (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule); int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule); -int NBC_Sched_op(void* buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule); +int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule); +int NBC_Sched_op (void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule); int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule); int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule); int NBC_Sched_barrier(NBC_Schedule *schedule); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index 78cbf43978..d0bbf4bd32 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -6,16 +6,24 @@ * rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2014-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler * */ + +#include "ompi/op/op.h" + #include "nbc_internal.h" -static inline int red_sched_binomial(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle); -static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); +static inline int red_sched_binomial (int rank, int p, int root, void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int red_sched_chain (int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); -static inline int red_sched_linear(int rank, int rsize, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int red_sched_linear (int rank, int rsize, int root, void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -50,16 +58,16 @@ int ompi_coll_libnbc_ireduce(void* sendbuf, void* recvbuf, int count, MPI_Dataty NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; - + ptrdiff_t span, gap; + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_size(comm, &p); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } res = MPI_Type_extent(datatype, &ext); if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } res = MPI_Type_size(datatype, &size); @@ -70,20 +78,23 @@ int ompi_coll_libnbc_ireduce(void* sendbuf, void* recvbuf, int count, MPI_Dataty res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } } - + + span = opal_datatype_span(&datatype->super, count, &gap); + /* algorithm selection */ - if(p > 4 || size*count < 65536) { + if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { alg = NBC_RED_BINOMIAL; if(rank == root) { /* root reduces in receivebuffer */ - handle->tmpbuf = malloc(ext*count); + handle->tmpbuf = malloc (span); + redbuf = recvbuf; } else { /* recvbuf may not be valid on non-root nodes */ - handle->tmpbuf = malloc(ext*count*2); - redbuf = ((char*)handle->tmpbuf)+(ext*count); + handle->tmpbuf = malloc (2*span); + redbuf = (char*) handle->tmpbuf + span - gap; } } else { - handle->tmpbuf = malloc(ext*count); + handle->tmpbuf = malloc (span); alg = NBC_RED_CHAIN; segsize = 16384/2; } @@ -108,7 +119,7 @@ int ompi_coll_libnbc_ireduce(void* sendbuf, void* recvbuf, int count, MPI_Dataty switch(alg) { case NBC_RED_BINOMIAL: - res = red_sched_binomial(rank, p, root, sendbuf, recvbuf, count, datatype, op, redbuf, schedule, handle); + res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, inplace, schedule, handle); break; case NBC_RED_CHAIN: res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); @@ -152,23 +163,20 @@ int ompi_coll_libnbc_ireduce_inter(void* sendbuf, void* recvbuf, int count, MPI_ struct mca_coll_base_module_2_0_0_t *module) { int rank, res, rsize; NBC_Schedule *schedule; - MPI_Aint ext; NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; + + rank = ompi_comm_rank (comm); + rsize = ompi_comm_remote_size (comm); res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_remote_size(comm, &rsize); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res); return res; } - res = MPI_Type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } - - handle->tmpbuf = malloc(ext*count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (NULL == handle->tmpbuf) { printf("Error in malloc() (%i)\n", res); return res; } schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); @@ -177,7 +185,7 @@ int ompi_coll_libnbc_ireduce_inter(void* sendbuf, void* recvbuf, int count, MPI_ res = NBC_Sched_create(schedule); if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } - res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, count, datatype, op, schedule, handle); + res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, (void *)(-gap), count, datatype, op, schedule, handle); if (NBC_OK != res) { printf("Error in Schedule creation() (%i)\n", res); return res; } res = NBC_Sched_commit(schedule); @@ -192,6 +200,8 @@ int ompi_coll_libnbc_ireduce_inter(void* sendbuf, void* recvbuf, int count, MPI_ /* binomial reduce + * if op is not commutative, reduce on rank 0, and then send the result to root rank + * * working principle: * - each node gets a virtual rank vrank * - the 'root' node get vrank 0 @@ -220,72 +230,103 @@ int ompi_coll_libnbc_ireduce_inter(void* sendbuf, void* recvbuf, int count, MPI_ if (vrank == 0) rank = root; \ if (vrank == root) rank = 0; \ } -static inline int red_sched_binomial(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { - int firstred, vrank, vpeer, peer, res, maxr, r; - - RANK2VRANK(rank, vrank, root); - maxr = (int)ceil((log(p)/LOG2)); +static inline int red_sched_binomial (int rank, int p, int root, void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { + int firstred, vroot, vrank, vpeer, peer, res, maxr, r; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + ptrdiff_t gap; + (void)opal_datatype_span(&datatype->super, count, &gap); + + if (ompi_op_is_commute(op)) { + vroot = root; + } else { + vroot = 0; + } + RANK2VRANK(rank, vrank, vroot); + maxr = (int)ceil((log((double)p)/LOG2)); + + /* ensure the result ends up in redbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = redbuf; + tmplbuf = false; + } else { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = redbuf; + tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf)-gap, count, datatype, MPI_COMM_SELF); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } + } firstred = 1; - for(r=1; r<=maxr; r++) { - if((vrank % (1<tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* perform the reduce in my local buffer */ - if(firstred) { - if(rank == root) { - /* root is the only one who reduces in the receivebuffer - * take data from sendbuf in first round - save copy */ - res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); - } else { - /* all others may not have a receive buffer - * take data from sendbuf in first round - save copy */ - res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); - } + /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ + if (firstred && !inplace) { + /* perform the reduce with the senbuf */ + res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule); firstred = 0; } else { - if(rank == root) { - /* root is the only one who reduces in the receivebuffer */ - res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); - } else { - /* all others may not have a receive buffer */ - res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, (char *)redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); - } + /* perform the reduce in my local buffer */ + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } - /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ - vpeer = vrank - (1<<(r-1)); - VRANK2RANK(peer, vpeer, root) - if(firstred) { - /* we did not reduce anything */ - res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); + vpeer = vrank - (1 << (r - 1)); + VRANK2RANK(peer, vpeer, vroot) + if (firstred && !inplace) { + /* we have to use the sendbuf in the first round .. */ + res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule); } else { - /* we have to use the redbuf the root (which works in receivebuf) is never sending .. */ - res = NBC_Sched_send((char *)redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule); + /* and the redbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } /* leave the game */ break; } } + /* send to root if vroot ! root */ + if (vroot != root) { + if (0 == rank) { + res = NBC_Sched_send (redbuf, false, count, datatype, root, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + } else if (root == rank) { + res = NBC_Sched_recv (redbuf, false, count, datatype, vroot, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + } + } return NBC_OK; } /* chain send ... */ -static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize) { +static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize) { int res, vrank, rpeer, speer, numfrag, fragnum, fragcount, thiscount; long offset; @@ -309,17 +350,29 @@ static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void } /* last node does not recv */ - if(vrank != p-1) { - res = NBC_Sched_recv((char*)offset, true, thiscount, datatype, rpeer, schedule); + if (vrank != p-1) { + if (vrank == 0) { + res = NBC_Sched_recv ((char *)recvbuf+offset, false, thiscount, datatype, rpeer, schedule); + } else { + res = NBC_Sched_recv ((char *) offset, true, thiscount, datatype, rpeer, schedule); + } if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* this barrier here seems awkward but isn't!!!! */ res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* root reduces into receivebuf */ if(vrank == 0) { - res = NBC_Sched_op((char*)recvbuf+offset, false, (char*)sendbuf+offset, false, (char*)offset, true, thiscount, datatype, op, schedule); + res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) recvbuf + offset, false, + thiscount, datatype, op, schedule); } else { - res = NBC_Sched_op((char*)offset, true, (char*)sendbuf+offset, false, (char*)offset, true, thiscount, datatype, op, schedule); + res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) offset, true, thiscount, + datatype, op, schedule); } + if (NBC_OK != res) { printf("Error in NBC_Sched_op() (%i)\n", res); return res; } + res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } /* root does not send */ @@ -333,6 +386,7 @@ static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } /* this barrier here seems awkward but isn't!!!! */ res = NBC_Sched_barrier(schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } } @@ -340,31 +394,47 @@ static inline int red_sched_chain(int rank, int p, int root, void *sendbuf, void } /* simple linear algorithm for intercommunicators */ -static inline int red_sched_linear(int rank, int rsize, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { - int res, peer; +static inline int red_sched_linear (int rank, int rsize, int root, void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, + MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { + int res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; if(count == 0) return NBC_OK; if (MPI_ROOT == root) { - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = tmpbuf; + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = tmpbuf; + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } - res = NBC_Sched_barrier (schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + res = NBC_Sched_recv (lbuf, tmplbuf, count, datatype, 0, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - for (peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule); + for (int peer = 1 ; peer < rsize ; ++peer) { + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, schedule); + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_op() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else if (MPI_PROC_NULL != root) { res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index 34fc8760d6..f69e5c0d21 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -6,7 +6,7 @@ * rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -39,20 +39,29 @@ int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcoun struct mca_coll_base_module_2_0_0_t *module) { int peer, rank, maxr, p, r, res, count, offset, firstred; MPI_Aint ext; - char *redbuf, *sbuf, inplace; + ptrdiff_t gap, span; + char *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; - + char *rbuf, *lbuf, *buf; + NBC_IN_PLACE(sendbuf, recvbuf, inplace); + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + + res = ompi_datatype_type_extent (datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in ompi_datatype_type_extent() (%i)", res); return res; } + + count = 0; + for(r=0;rsuper, count, &gap); - handle->tmpbuf = malloc(ext*count*2); + handle->tmpbuf = malloc (span * 2); if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } - redbuf = ((char*)handle->tmpbuf)+(ext*count); + rbuf = (char *)(-gap); + lbuf = (char *)(span - gap); firstred = 1; for(r=1; r<=maxr; r++) { @@ -95,23 +99,28 @@ int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcoun /* we have to receive this round */ peer = rank + (1<<(r-1)); if(peertmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - if(firstred) { + + /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ + if (firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } - /* this cannot be done until handle->tmpbuf is unused :-( */ + res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ @@ -120,8 +129,8 @@ int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcoun /* we have to send the senbuf */ res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); } else { - /* we send an already reduced value from redbuf */ - res = NBC_Sched_send(redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule); + /* we send an already reduced value from lbuf */ + res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } /* leave the game */ @@ -133,24 +142,24 @@ int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcoun if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* rank 0 is root and sends - all others receive */ - if(rank != 0) { - res = NBC_Sched_recv(recvbuf, false, recvcounts[rank], datatype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - } - if(rank == 0) { offset = 0; for(r=1;rtmpbuf, true, recvcounts[r], datatype, r, schedule); + res = NBC_Sched_send (sbuf, true, recvcounts[r], datatype, r, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } } - res = NBC_Sched_copy(redbuf-(unsigned long)handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule); + + res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } + } else { + res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } } + /*NBC_PRINT_SCHED(*schedule);*/ res = NBC_Sched_commit(schedule); @@ -163,11 +172,12 @@ int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcoun return NBC_OK; } -int ompi_coll_libnbc_ireduce_scatter_inter(void* sendbuf, void* recvbuf, int *recvcounts, MPI_Datatype datatype, - MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, - struct mca_coll_base_module_2_0_0_t *module) { - int peer, rank, r, res, count, rsize, offset; +int ompi_coll_libnbc_ireduce_scatter_inter (void* sendbuf, void* recvbuf, int *recvcounts, MPI_Datatype datatype, + MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, + struct mca_coll_base_module_2_0_0_t *module) { + int rank, res, count, lsize, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; @@ -176,12 +186,13 @@ int ompi_coll_libnbc_ireduce_scatter_inter(void* sendbuf, void* recvbuf, int *re res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_remote_size(comm, &rsize); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res); return res; } - MPI_Type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + rank = ompi_comm_rank (comm); + lsize = ompi_comm_size(comm); + rsize = ompi_comm_remote_size (comm); + + res = ompi_datatype_type_extent (datatype, &ext); + if (MPI_SUCCESS != res) { printf("MPI Error in ompi_datatype_type_extent() (%i)", res); return res; } + schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); if (NULL == schedule) { printf("Error in malloc()\n"); return NBC_OOR; } @@ -190,59 +201,75 @@ int ompi_coll_libnbc_ireduce_scatter_inter(void* sendbuf, void* recvbuf, int *re if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } count = 0; - for (r = 0 ; r < rsize ; ++r) count += recvcounts[r]; + for (int r = 0 ; r < lsize ; ++r) { + count += recvcounts[r]; + } - handle->tmpbuf = malloc(2 * ext * count); - if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + span = opal_datatype_span(&datatype->super, count, &gap); + + if (count > 0) { + handle->tmpbuf = malloc (2 * span); + if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + } /* send my data to the remote root */ res = NBC_Sched_send(sendbuf, false, count, datatype, 0, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } if (0 == rank) { - res = NBC_Sched_recv((void *) 0, true, count, datatype, 0, schedule); + char *lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, datatype, 0, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - for (peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv((void *)(ext * count), true, count, datatype, peer, schedule); + for (int peer = 1 ; peer < rsize ; ++peer) { + char *tbuf; + res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - res = NBC_Sched_op((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv((void *)(ext * count), true, count, datatype, 0, schedule); + res = NBC_Sched_recv (rbuf, true, count, datatype, 0, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - res = NBC_Sched_send((void *) 0, true, count, datatype, 0, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + res = NBC_Sched_send (lbuf, true, count, datatype, 0, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + + /* do the local scatterv with the local communicator */ + res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, + recvcounts[0], datatype, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } + + for (int peer = 1, offset = recvcounts[0] * ext; peer < lsize ; ++peer) { + res = NBC_Sched_local_send (lbuf + offset, true, recvcounts[peer], datatype, peer, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_local_send() (%i)\n", res); return res; } - /* scatter */ - for (peer = 0, offset = ext * count ; peer < rsize ; ++peer) { - res = NBC_Sched_send((void *)(uintptr_t) offset, true, recvcounts[peer], datatype, peer, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } offset += recvcounts[peer] * ext; } + } else { + /* receive my block */ + res = NBC_Sched_local_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_local_recv() (%i)\n", res); return res; } } - /* receive my block */ - res = NBC_Sched_recv(recvbuf, false, recvcounts[rank], datatype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - /*NBC_PRINT_SCHED(*schedule);*/ res = NBC_Sched_commit(schedule); diff --git a/ompi/mca/coll/libnbc/coll_libnbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c similarity index 70% rename from ompi/mca/coll/libnbc/coll_libnbc_ireduce_scatter_block.c rename to ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index e40439b15e..24ecf72236 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -37,6 +37,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec struct mca_coll_base_module_2_0_0_t *module) { int peer, rank, maxr, p, r, res, count, offset, firstred; MPI_Aint ext; + ptrdiff_t gap, span; char *redbuf, *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; @@ -45,15 +46,14 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec NBC_IN_PLACE(sendbuf, recvbuf, inplace); + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + res = MPI_Type_extent(datatype, &ext); + if (MPI_SUCCESS != res || 0 == ext) { printf("MPI Error in MPI_Type_extent() (%i:%i)\n", res, (int)ext); return (MPI_SUCCESS == res) ? MPI_ERR_SIZE : res; } + res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_size(comm, &p); - if (MPI_SUCCESS != res || 0 == p) { printf("MPI Error in MPI_Comm_size() (%i:%i)\n", res, p); return (MPI_SUCCESS == res) ? MPI_ERR_SIZE : res; } - MPI_Type_extent(datatype, &ext); - if (MPI_SUCCESS != res || 0 == ext) { printf("MPI Error in MPI_Type_extent() (%i:%i)\n", res, (int)ext); return (MPI_SUCCESS == res) ? MPI_ERR_SIZE : res; } schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); if (NULL == schedule) { printf("Error in malloc()\n"); return NBC_OOR; } @@ -66,10 +66,15 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec count = p * recvcount; if (0 < count) { - handle->tmpbuf = malloc(ext*count*2); - if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + char *rbuf, *lbuf, *buf; + + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (2*span); + if (NULL == handle->tmpbuf) { printf("Error in malloc()\n"); return NBC_OOR; } - redbuf = ((char*)handle->tmpbuf)+(ext*count); + rbuf = (void *)(-gap); + lbuf = (char *)(span - gap); + redbuf = (char *) handle->tmpbuf + span - gap; /* copy data to redbuf if we only have a single node */ if((p==1) && !inplace) { @@ -83,23 +88,25 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec /* we have to receive this round */ peer = rank + (1<<(r-1)); if(peertmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } if(firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ @@ -109,7 +116,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule); } else { /* we send an already reduced value from redbuf */ - res = NBC_Sched_send(redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule); + res = NBC_Sched_send(lbuf, true, count, datatype, peer, schedule); } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } /* leave the game */ @@ -124,19 +131,19 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec if(rank != 0) { res = NBC_Sched_recv(recvbuf, false, recvcount, datatype, 0, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - } - - if(rank == 0) { + } else { offset = 0; for(r=1;rtmpbuf, true, recvcount, datatype, r, schedule); + res = NBC_Sched_send(sbuf, true, recvcount, datatype, r, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } } - res = NBC_Sched_copy(redbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, recvbuf, false, recvcount, datatype, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } + if ((p != 1) || !inplace) { + res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount, datatype, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } + } } } @@ -152,11 +159,12 @@ int ompi_coll_libnbc_ireduce_scatter_block(void* sendbuf, void* recvbuf, int rec return NBC_OK; } -int ompi_coll_libnbc_ireduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype, +int ompi_coll_libnbc_ireduce_scatter_block_inter(void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request, struct mca_coll_base_module_2_0_0_t *module) { - int peer, rank, res, count, rsize; + int peer, rank, res, count, lsize, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request; @@ -165,12 +173,13 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(void *sbuf, void *rbuf, int rco res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_remote_size(comm, &rsize); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res); return res; } - MPI_Type_extent(dtype, &ext); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } + + rank = ompi_comm_rank (comm); + lsize = ompi_comm_size (comm); + rsize = ompi_comm_remote_size (comm); + + res = ompi_datatype_type_extent (dtype, &ext); + if (MPI_SUCCESS != res) { printf ("MPI Error in ompi_datatype_type_extent() (%i)", res); return res; } schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule)); if (NULL == schedule) { printf("Error in malloc()\n"); return NBC_OOR; } @@ -178,58 +187,60 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(void *sbuf, void *rbuf, int rco res = NBC_Sched_create(schedule); if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } - count = rcount * rsize; + count = rcount * lsize; + span = opal_datatype_span(&dtype->super, count, &gap); - handle->tmpbuf = malloc(2*ext*count); - if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + if (count > 0) { + handle->tmpbuf = malloc (2 * span); + if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + } /* send my data to the remote root */ - res = NBC_Sched_send(sbuf, false, count, dtype, 0, schedule); + res = NBC_Sched_send(sendbuf, false, count, dtype, 0, schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } if (0 == rank) { - res = NBC_Sched_recv((void *) 0, true, count, dtype, 0, schedule); + char *lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, dtype, 0, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } for (peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv((void *)(ext * count), true, count, dtype, peer, schedule); + char *tbuf; + + res = NBC_Sched_recv (rbuf, true, count, dtype, peer, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - res = NBC_Sched_op((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, dtype, op, schedule); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, dtype, op, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } - /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv((void *)(ext * count), true, count, dtype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } + /* do the scatter with the local communicator */ + res = NBC_Sched_copy (lbuf, true, rcount, dtype, recvbuf, false, rcount, dtype, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_copy() (%i)\n", res); return res; } - res = NBC_Sched_send((void *) 0, true, count, dtype, 0, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } - - res = NBC_Sched_barrier(schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - - /* scatter */ - for (peer = 0 ; peer < rsize ; ++peer) { - res = NBC_Sched_send((void *)(ext * (count + peer * rcount)), true, rcount, dtype, peer, schedule); - if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; } + for (int peer = 1 ; peer < lsize ; ++peer) { + res = NBC_Sched_local_send (lbuf + ext * rcount * peer, true, rcount, dtype, peer, schedule); + if (NBC_OK != res) { printf("Error in NBC_Sched_local_send() (%i)\n", res); return res; } } + } else { + /* receive my block */ + res = NBC_Sched_local_recv(recvbuf, false, rcount, dtype, 0, schedule); + if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_local_recv() (%i)\n", res); return res; } } - /* receive my block */ - res = NBC_Sched_recv(rbuf, true, rcount, dtype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - /*NBC_PRINT_SCHED(*schedule);*/ res = NBC_Sched_commit(schedule); diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index 8b6ed6d5fd..4102be0019 100644 --- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -30,16 +30,16 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { /* linear iscan * working principle: - * 1. each node (but node 0) receives from left neigbor + * 1. each node (but node 0) receives from left neighbor * 2. performs op - * 3. all but rank p-1 do sends to it's right neigbor and exits + * 3. all but rank p-1 do sends to it's right neighbor and exits * */ int ompi_coll_libnbc_iscan(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_0_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -54,20 +54,14 @@ int ompi_coll_libnbc_iscan(void* sendbuf, void* recvbuf, int count, MPI_Datatype res = NBC_Init_handle(comm, coll_req, libnbc_module); if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; } handle = (*coll_req); - res = MPI_Comm_rank(comm, &rank); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; } - res = MPI_Comm_size(comm, &p); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; } - res = MPI_Type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; } - - handle->tmpbuf = malloc(ext*count); - if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } - if((rank == 0) && !inplace) { + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + + if (!inplace) { /* copy data to receivebuf */ - res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); - if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; } + res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm); + if(res != NBC_OK) { printf("Error in NBC_Copy(%i)\n", res); return res; } } #ifdef NBC_CACHE_SCHEDULE @@ -87,15 +81,20 @@ int ompi_coll_libnbc_iscan(void* sendbuf, void* recvbuf, int count, MPI_Datatype if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; } if(rank != 0) { - res = NBC_Sched_recv(0, true, count, datatype, rank-1, schedule); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); + if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } + + /* we have to wait until we have the data */ + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* perform the reduce in my local buffer */ - res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); + /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ + res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } - /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } From efa0df021c02fad32017c9029f2c5373186ad11a Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 8 Jun 2016 16:48:00 +0900 Subject: [PATCH 08/11] coll/base: silence misc warning as reported by Coverity with CIDs 1363349-1363362 (back-ported from commit open-mpi/ompi@7b8094aac12b33119e327aa0692fcd8d0e85a958) --- ompi/mca/coll/basic/coll_basic_allgather.c | 9 ++++----- ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c | 2 +- ompi/mca/coll/inter/coll_inter_allgather.c | 6 +++--- ompi/mca/coll/inter/coll_inter_allgatherv.c | 2 +- ompi/mca/coll/inter/coll_inter_gather.c | 8 ++++---- ompi/mca/coll/inter/coll_inter_gatherv.c | 2 +- ompi/mca/coll/inter/coll_inter_scatter.c | 4 ++-- ompi/mca/coll/inter/coll_inter_scatterv.c | 2 +- ompi/mca/coll/libnbc/nbc_iexscan.c | 4 ++-- 9 files changed, 19 insertions(+), 20 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic_allgather.c b/ompi/mca/coll/basic/coll_basic_allgather.c index 3bd0560c5b..5679496848 100644 --- a/ompi/mca/coll/basic/coll_basic_allgather.c +++ b/ompi/mca/coll/basic/coll_basic_allgather.c @@ -158,12 +158,11 @@ mca_coll_basic_allgather_inter(void *sbuf, int scount, goto exit; } - span = opal_datatype_span(&sdtype->super, scount * size, &gap); + /* Step 2: exchange the resuts between the root processes */ + span = opal_datatype_span(&sdtype->super, (int64_t)scount * (int64_t)size, &gap); tmpbuf_free = (char *) malloc(span); - if (NULL == tmpbuf_free) { - err = OMPI_ERR_OUT_OF_RESOURCE; - goto exit; - } + if (NULL == tmpbuf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } + tmpbuf = tmpbuf_free - gap; err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0, diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index ce45bb4ac1..b725919368 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -127,7 +127,7 @@ mca_coll_basic_reduce_scatter_block_inter(void *sbuf, void *rbuf, int rcount, int totalcounts; ptrdiff_t gap, span; char *tmpbuf = NULL, *tmpbuf2 = NULL; - char *lbuf, *buf; + char *lbuf = NULL, *buf; ompi_request_t *req; rank = ompi_comm_rank(comm); diff --git a/ompi/mca/coll/inter/coll_inter_allgather.c b/ompi/mca/coll/inter/coll_inter_allgather.c index 4454b4534c..62988be986 100644 --- a/ompi/mca/coll/inter/coll_inter_allgather.c +++ b/ompi/mca/coll/inter/coll_inter_allgather.c @@ -48,8 +48,8 @@ mca_coll_inter_allgather_inter(void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int rank, root = 0, size, rsize, err; - char *ptmp_free = NULL, *ptmp; + int rank, root = 0, size, rsize, err = OMPI_SUCCESS; + char *ptmp_free = NULL, *ptmp = NULL; ptrdiff_t gap, span; ompi_request_t *req[2]; @@ -59,7 +59,7 @@ mca_coll_inter_allgather_inter(void *sbuf, int scount, /* Perform the gather locally at the root */ if ( scount > 0 ) { - span = opal_datatype_span(&sdtype->super, scount*size, &gap); + span = opal_datatype_span(&sdtype->super, (int64_t)scount*(int64_t)size, &gap); ptmp_free = (char*)malloc(span); if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; diff --git a/ompi/mca/coll/inter/coll_inter_allgatherv.c b/ompi/mca/coll/inter/coll_inter_allgatherv.c index 6e15c1befc..bca15e07c6 100644 --- a/ompi/mca/coll/inter/coll_inter_allgatherv.c +++ b/ompi/mca/coll/inter/coll_inter_allgatherv.c @@ -49,7 +49,7 @@ mca_coll_inter_allgatherv_inter(void *sbuf, int scount, { int i, rank, size, size_local, total=0, err; int *count=NULL,*displace=NULL; - char *ptmp_free=NULL, *ptmp; + char *ptmp_free=NULL, *ptmp=NULL; ompi_datatype_t *ndtype = NULL; ompi_request_t *req[2]; diff --git a/ompi/mca/coll/inter/coll_inter_gather.c b/ompi/mca/coll/inter/coll_inter_gather.c index 177a1278aa..39128e5610 100644 --- a/ompi/mca/coll/inter/coll_inter_gather.c +++ b/ompi/mca/coll/inter/coll_inter_gather.c @@ -61,11 +61,11 @@ mca_coll_inter_gather_inter(void *sbuf, int scount, int size_local; ptrdiff_t gap, span; - size_local = ompi_comm_size(comm->c_local_comm); - span = opal_datatype_span(&sdtype->super, scount*size_local, &gap); + size_local = ompi_comm_size(comm->c_local_comm); + span = opal_datatype_span(&sdtype->super, (int64_t)scount*(int64_t)size_local, &gap); - ptmp_free = (char*)malloc(span); - if (NULL == ptmp_free) { + ptmp_free = (char*)malloc(span); + if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } ptmp = ptmp_free - gap; diff --git a/ompi/mca/coll/inter/coll_inter_gatherv.c b/ompi/mca/coll/inter/coll_inter_gatherv.c index 9aa0720541..875e2a468d 100644 --- a/ompi/mca/coll/inter/coll_inter_gatherv.c +++ b/ompi/mca/coll/inter/coll_inter_gatherv.c @@ -46,7 +46,7 @@ mca_coll_inter_gatherv_inter(void *sbuf, int scount, { int i, rank, size, size_local, total=0, err; int *count=NULL, *displace=NULL; - char *ptmp_free=NULL, *ptmp; + char *ptmp_free=NULL, *ptmp=NULL; ompi_datatype_t *ndtype; if (MPI_PROC_NULL == root) { /* do nothing */ diff --git a/ompi/mca/coll/inter/coll_inter_scatter.c b/ompi/mca/coll/inter/coll_inter_scatter.c index 190f013b54..6163568809 100644 --- a/ompi/mca/coll/inter/coll_inter_scatter.c +++ b/ompi/mca/coll/inter/coll_inter_scatter.c @@ -56,13 +56,13 @@ mca_coll_inter_scatter_inter(void *sbuf, int scount, err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* First process receives the data from root */ - char *ptmp_free = NULL, *ptmp; + char *ptmp_free = NULL, *ptmp = NULL; if(0 == rank) { int size_local; ptrdiff_t gap, span; size_local = ompi_comm_size(comm->c_local_comm); - span = opal_datatype_span(&rdtype->super, rcount*size_local, &gap); + span = opal_datatype_span(&rdtype->super, (int64_t)rcount*(int64_t)size_local, &gap); ptmp_free = malloc(span); if (NULL == ptmp_free) { return OMPI_ERR_OUT_OF_RESOURCE; diff --git a/ompi/mca/coll/inter/coll_inter_scatterv.c b/ompi/mca/coll/inter/coll_inter_scatterv.c index 6eaa133959..3f710b1051 100644 --- a/ompi/mca/coll/inter/coll_inter_scatterv.c +++ b/ompi/mca/coll/inter/coll_inter_scatterv.c @@ -46,7 +46,7 @@ mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, { int i, rank, size, err, total=0, size_local; int *counts=NULL,*displace=NULL; - char *ptmp_free=NULL, *ptmp; + char *ptmp_free=NULL, *ptmp=NULL; ompi_datatype_t *ndtype; /* Initialize */ diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 90383a6aeb..7af0777bdb 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -67,9 +67,9 @@ int ompi_coll_libnbc_iexscan(void* sendbuf, void* recvbuf, int count, MPI_Dataty handle->tmpbuf = malloc(span); if (handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; } if (inplace) { - NBC_Copy(recvbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + res = NBC_Copy(recvbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); } else { - NBC_Copy(sendbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + res = NBC_Copy(sendbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); } } From ac36c80daa3c144fbd74344d3ded3375d0dd655d Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 8 Jul 2016 16:55:26 +0900 Subject: [PATCH 09/11] coll/base: fix non zero lower bound datatype handling in mca_coll_base_alltoallv_intra_basic_inplace() (back-ported from commit open-mpi/ompi@a55d57406be6f19f35844aa605219b92d58f745f) --- ompi/mca/coll/tuned/coll_tuned_alltoallv.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c index 5c9f4b1b21..fefc8a032c 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c @@ -14,7 +14,7 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All Rights * reserved. * Copyright (c) 2013 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ @@ -58,7 +58,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; - char *tmp_buffer; + char *allocated_buffer, *tmp_buffer; size_t max_size, rdtype_size; OPAL_PTRDIFF_TYPE ext, gap; @@ -82,11 +82,11 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con /* The gap will always be the same as we are working on the same datatype */ /* Allocate a temporary buffer */ - tmp_buffer = calloc (max_size, 1); - if (NULL == tmp_buffer) { + allocated_buffer = calloc (max_size, 1); + if (NULL == allocated_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - tmp_buffer += gap; + tmp_buffer = allocated_buffer - gap; /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { @@ -139,7 +139,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con error_hndl: /* Free the temporary buffer */ - free (tmp_buffer); + free (allocated_buffer); /* All done */ From 5dbf964b200bfe8afdb57d3ecb83f157835ea20d Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 9 Jun 2016 13:12:25 +0900 Subject: [PATCH 10/11] coll/base: fix memory free in ompi_coll_base_allreduce_intra_recursivedoubling err handler Fix CID 1362630 Fixes open-mpi/ompi@0e393195d9f2373ffa9d59a240092f643117cd39 (back-ported from commit open-mpi/ompi@80e362de521409897da64f5f267da33946135d35) --- ompi/mca/coll/tuned/coll_tuned_allreduce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce.c b/ompi/mca/coll/tuned/coll_tuned_allreduce.c index 14dfd8dd45..6f4e084a89 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce.c @@ -290,7 +290,8 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, opal_output_verbose(COLL_TUNED_VERBOSITY, ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret); - if (NULL != inplacebuf) free(inplacebuf); + (void)line; // silence compiler warning + if (NULL != inplacebuf_free) free(inplacebuf_free); return ret; } From a6cf208a883d5dc7c79bb93f40eb4ef7546aabcf Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 11 Jul 2016 17:18:30 +0900 Subject: [PATCH 11/11] coll/libnbc: do not exchange data between roots in ompi_coll_libnbc_ireduce_scatter_inter() this is now useless since the scatter is done via the local communicator (back-ported from commit open-mpi/ompi@14624506df560975e03fb9ef9d01822728aa84a4) --- ompi/mca/coll/libnbc/nbc_ireduce_scatter.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index f69e5c0d21..01b0c01861 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -243,16 +243,6 @@ int ompi_coll_libnbc_ireduce_scatter_inter (void* sendbuf, void* recvbuf, int *r tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } - /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv (rbuf, true, count, datatype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } - - res = NBC_Sched_send (lbuf, true, count, datatype, 0, schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; } - - res = NBC_Sched_barrier(schedule); - if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } - /* do the local scatterv with the local communicator */ res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule);