From c8eff467b6763f4f20d246c8089346a2f4208fc5 Mon Sep 17 00:00:00 2001
From: Akshay Venkatesh
Date: Tue, 13 Aug 2024 18:09:05 +0000
Subject: [PATCH] ompi/coll/cuda: implement reduce_local

Signed-off-by: Akshay Venkatesh
---
 ompi/mca/coll/cuda/coll_cuda.h        |  6 +++
 ompi/mca/coll/cuda/coll_cuda_module.c |  2 +
 ompi/mca/coll/cuda/coll_cuda_reduce.c | 62 +++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/ompi/mca/coll/cuda/coll_cuda.h b/ompi/mca/coll/cuda/coll_cuda.h
index 6b566c8eb1f..d281956f4b4 100644
--- a/ompi/mca/coll/cuda/coll_cuda.h
+++ b/ompi/mca/coll/cuda/coll_cuda.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
  * Copyright (c) 2014 The University of Tennessee and The University
  * of Tennessee Research Foundation. All rights
  * reserved.
@@ -45,6 +46,11 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);
 
+int mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, int count,
+                               struct ompi_datatype_t *dtype,
+                               struct ompi_op_t *op,
+                               mca_coll_base_module_t *module);
+
 int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
diff --git a/ompi/mca/coll/cuda/coll_cuda_module.c b/ompi/mca/coll/cuda/coll_cuda_module.c
index 137f55a7636..44ac08cb093 100644
--- a/ompi/mca/coll/cuda/coll_cuda_module.c
+++ b/ompi/mca/coll/cuda/coll_cuda_module.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
  * Copyright (c) 2014-2017 The University of Tennessee and The University
  * of Tennessee Research Foundation. All rights
  * reserved.
@@ -104,6 +105,7 @@ mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
     cuda_module->super.coll_gather = NULL;
     cuda_module->super.coll_gatherv = NULL;
     cuda_module->super.coll_reduce = mca_coll_cuda_reduce;
+    cuda_module->super.coll_reduce_local = mca_coll_cuda_reduce_local;
     cuda_module->super.coll_reduce_scatter = NULL;
     cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
     cuda_module->super.coll_scan = mca_coll_cuda_scan;
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index 1b46325ea57..0433133f9ab 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
  * Copyright (c) 2004-2015 The University of Tennessee and The University
  * of Tennessee Research Foundation. All rights
  * reserved.
@@ -78,3 +79,64 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
     }
     return rc;
 }
+
+/*
+ * Local reduction of sbuf into rbuf for buffers that may not be directly
+ * accessible from the host.
+ *
+ * Every operand for which opal_cuda_check_bufs() returns non-zero
+ * (presumably: the pointer refers to CUDA device memory) is staged into a
+ * malloc'ed host buffer.  ompi_op_reduce() then runs on the host copies and,
+ * if rbuf was staged, the result is copied back to the caller's rbuf.
+ *
+ * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when a staging buffer
+ * cannot be allocated.  The module argument is not used.
+ */
+int
+mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, int count,
+                           struct ompi_datatype_t *dtype,
+                           struct ompi_op_t *op,
+                           mca_coll_base_module_t *module)
+{
+    ptrdiff_t gap;
+    char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
+    size_t bufsize;
+
+    /* bufsize is the byte span of count elements; gap is the offset of that
+     * span from the user pointer (NOTE(review): per opal_datatype_span(),
+     * confirm), so the bytes to stage live at buf + gap. */
+    bufsize = opal_datatype_span(&dtype->super, count, &gap);
+
+    /* An empty span needs no staging; skipping it also keeps a NULL return
+     * from malloc(0) from being reported as an out-of-resource error. */
+    if ((0 != bufsize) && (MPI_IN_PLACE != sbuf) &&
+        opal_cuda_check_bufs((char *)sbuf, NULL)) {
+        sbuf1 = (char *)malloc(bufsize);
+        if (NULL == sbuf1) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        opal_cuda_memcpy_sync(sbuf1, (const char *)sbuf + gap, bufsize);
+        sbuf = sbuf1 - gap; /* so that sbuf + gap == sbuf1 */
+    }
+
+    if ((0 != bufsize) && opal_cuda_check_bufs((char *)rbuf, NULL)) {
+        rbuf1 = (char *)malloc(bufsize);
+        if (NULL == rbuf1) {
+            free(sbuf1); /* free(NULL) is a no-op */
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        opal_cuda_memcpy_sync(rbuf1, (char *)rbuf + gap, bufsize);
+        rbuf2 = (char *)rbuf; /* save away original buffer */
+        rbuf = rbuf1 - gap;
+    }
+
+    ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
+
+    free(sbuf1);
+    if (NULL != rbuf1) {
+        /* Publish the host result into the caller's original buffer. */
+        opal_cuda_memcpy_sync(rbuf2 + gap, rbuf1, bufsize);
+        free(rbuf1);
+    }
+    return OMPI_SUCCESS;
+}