From c8eff467b6763f4f20d246c8089346a2f4208fc5 Mon Sep 17 00:00:00 2001
From: Akshay Venkatesh <akvenkatesh@nvidia.com>
Date: Tue, 13 Aug 2024 18:09:05 +0000
Subject: [PATCH] ompi/coll/cuda: implement reduce_local

Signed-off-by: Akshay Venkatesh <akvenkatesh@nvidia.com>
---
 ompi/mca/coll/cuda/coll_cuda.h        |  6 ++++
 ompi/mca/coll/cuda/coll_cuda_module.c |  2 ++
 ompi/mca/coll/cuda/coll_cuda_reduce.c | 48 +++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/ompi/mca/coll/cuda/coll_cuda.h b/ompi/mca/coll/cuda/coll_cuda.h
index 6b566c8eb1f..d281956f4b4 100644
--- a/ompi/mca/coll/cuda/coll_cuda.h
+++ b/ompi/mca/coll/cuda/coll_cuda.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2014      The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
@@ -45,6 +46,11 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);
 
+/* Reduce sbuf into rbuf on the calling process (see coll_cuda_reduce.c). */
+int mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, int count,
+                               struct ompi_datatype_t *dtype, struct ompi_op_t *op,
+                               mca_coll_base_module_t *module);
+
 int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
diff --git a/ompi/mca/coll/cuda/coll_cuda_module.c b/ompi/mca/coll/cuda/coll_cuda_module.c
index 137f55a7636..44ac08cb093 100644
--- a/ompi/mca/coll/cuda/coll_cuda_module.c
+++ b/ompi/mca/coll/cuda/coll_cuda_module.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2014-2017 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
@@ -104,6 +105,7 @@ mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
     cuda_module->super.coll_gather     = NULL;
     cuda_module->super.coll_gatherv    = NULL;
     cuda_module->super.coll_reduce     = mca_coll_cuda_reduce;
+    cuda_module->super.coll_reduce_local   = mca_coll_cuda_reduce_local;
     cuda_module->super.coll_reduce_scatter = NULL;
     cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
     cuda_module->super.coll_scan       = mca_coll_cuda_scan;
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index 1b46325ea57..0433133f9ab 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2004-2015 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
@@ -78,3 +79,50 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
     }
     return rc;
 }
+
+/*
+ * Local reduction with CUDA support.  An operand for which
+ * opal_cuda_check_bufs() is true is staged in a malloc'ed host buffer, the
+ * reduction runs on the host copies via ompi_op_reduce(), and a staged rbuf is
+ * copied back afterwards.  Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE
+ * if a staging buffer cannot be allocated.
+ */
+int
+mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, int count,
+                           struct ompi_datatype_t *dtype,
+                           struct ompi_op_t *op,
+                           mca_coll_base_module_t *module)
+{
+    ptrdiff_t gap;
+    char *host_sbuf = NULL, *host_rbuf = NULL, *user_rbuf = rbuf;
+    size_t bufsize = opal_datatype_span(&dtype->super, count, &gap);
+
+    if (0 == bufsize) {
+        return OMPI_SUCCESS; /* zero-byte span; malloc(0) may return NULL */
+    }
+    if ((MPI_IN_PLACE != sbuf) && opal_cuda_check_bufs((char *)sbuf, NULL)) {
+        host_sbuf = malloc(bufsize);
+        if (NULL == host_sbuf) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        /* The datatype's span starts gap bytes past the buffer address. */
+        opal_cuda_memcpy_sync(host_sbuf, (const char *)sbuf + gap, bufsize);
+        sbuf = host_sbuf - gap;
+    }
+    if (opal_cuda_check_bufs((char *)rbuf, NULL)) {
+        host_rbuf = malloc(bufsize);
+        if (NULL == host_rbuf) {
+            free(host_sbuf);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        opal_cuda_memcpy_sync(host_rbuf, user_rbuf + gap, bufsize);
+        rbuf = host_rbuf - gap;
+    }
+    ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
+    if (NULL != host_rbuf) {
+        opal_cuda_memcpy_sync(user_rbuf + gap, host_rbuf, bufsize);
+    }
+    free(host_rbuf);
+    free(host_sbuf);
+    return OMPI_SUCCESS;
+}