diff --git a/ompi/mca/coll/accelerator/coll_accelerator.h b/ompi/mca/coll/accelerator/coll_accelerator.h index e707d7ec7f2..70d971cc9a8 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator.h +++ b/ompi/mca/coll/accelerator/coll_accelerator.h @@ -5,6 +5,7 @@ * reserved. * Copyright (c) 2014-2024 NVIDIA Corporation. All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -87,22 +88,24 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r * @retval >0 The buffer belongs to a managed buffer in * device memory. */ -static inline int mca_coll_accelerator_check_buf(void *addr) +static inline int mca_coll_accelerator_check_buf(void *addr, int *dev_id) { uint64_t flags; - int dev_id; + if (OPAL_LIKELY(NULL != addr)) { - return opal_accelerator.check_addr(addr, &dev_id, &flags); + return opal_accelerator.check_addr(addr, dev_id, &flags); } else { + *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; return 0; } } -static inline void *mca_coll_accelerator_memcpy(void *dest, const void *src, size_t size) +static inline void *mca_coll_accelerator_memcpy(void *dest, int dest_dev, const void *src, int src_dev, size_t size, + opal_accelerator_transfer_type_t type) { int res; - res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, - dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC); + + res = opal_accelerator.mem_copy(dest_dev, src_dev, dest, src, size, type); if (res != 0) { opal_output(0, "coll/accelerator: Error in mem_copy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int) size); diff --git a/ompi/mca/coll/accelerator/coll_accelerator_allreduce.c b/ompi/mca/coll/accelerator/coll_accelerator_allreduce.c index ad0566cad11..75e9051ce0a 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator_allreduce.c +++ b/ompi/mca/coll/accelerator/coll_accelerator_allreduce.c @@ -5,6 +5,7 @@ * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,11 +38,12 @@ mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, size_t count, mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module; ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int sbuf_dev, rbuf_dev; size_t bufsize; int rc; bufsize = opal_datatype_span(&dtype->super, count, &gap); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -50,10 +52,11 @@ mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, size_t count, if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, + bufsize, MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -63,7 +66,8 @@ mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, size_t count, if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, + bufsize, MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -73,7 +77,8 @@ mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, size_t count, } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc; diff --git a/ompi/mca/coll/accelerator/coll_accelerator_exscan.c b/ompi/mca/coll/accelerator/coll_accelerator_exscan.c index 4933cbcedd3..d77dd5a9b46 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator_exscan.c +++ b/ompi/mca/coll/accelerator/coll_accelerator_exscan.c @@ -5,6 +5,7 @@ * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,11 +30,12 @@ int mca_coll_accelerator_exscan(const void *sbuf, void *rbuf, size_t count, mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module; ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int sbuf_dev, rbuf_dev; size_t bufsize; int rc; bufsize = opal_datatype_span(&dtype->super, count, &gap); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -43,10 +45,11 @@ int mca_coll_accelerator_exscan(const void *sbuf, void *rbuf, size_t count, if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -56,7 +59,8 @@ int mca_coll_accelerator_exscan(const void *sbuf, void *rbuf, size_t count, if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -68,7 +72,8 @@ int mca_coll_accelerator_exscan(const void *sbuf, void *rbuf, size_t count, } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc; diff --git a/ompi/mca/coll/accelerator/coll_accelerator_reduce.c b/ompi/mca/coll/accelerator/coll_accelerator_reduce.c index 993271fa16b..509ea0961d4 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator_reduce.c +++ b/ompi/mca/coll/accelerator/coll_accelerator_reduce.c @@ -6,6 +6,7 @@ * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -39,12 +40,13 @@ mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count, int rank = ompi_comm_rank(comm); ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int rbuf_dev, sbuf_dev; size_t bufsize; int rc; bufsize = opal_datatype_span(&dtype->super, count, &gap); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -53,11 +55,12 @@ mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count, if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -67,7 +70,8 @@ mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count, if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -80,7 +84,8 @@ mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, size_t count, } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc; @@ -94,12 +99,13 @@ mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count, { ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int sbuf_dev, rbuf_dev; size_t bufsize; int rc; bufsize = opal_datatype_span(&dtype->super, count, &gap); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -109,11 +115,12 @@ mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count, if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -124,7 +131,8 @@ mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count, if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -137,7 +145,8 @@ mca_coll_accelerator_reduce_local(const void *sbuf, void *rbuf, size_t count, } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc; diff --git a/ompi/mca/coll/accelerator/coll_accelerator_reduce_scatter_block.c b/ompi/mca/coll/accelerator/coll_accelerator_reduce_scatter_block.c index 9dc27b61601..5f0fd61914f 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator_reduce_scatter_block.c +++ b/ompi/mca/coll/accelerator/coll_accelerator_reduce_scatter_block.c @@ -5,6 +5,7 @@ * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,13 +42,14 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module; ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int sbuf_dev, rbuf_dev; size_t sbufsize, rbufsize; int rc; rbufsize = opal_datatype_span(&dtype->super, rcount, &gap); sbufsize = rbufsize * ompi_comm_size(comm); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -56,10 +58,11 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, sbufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, sbufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -69,7 +72,8 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, rbufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, rbufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -80,7 +84,8 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, rbufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc; diff --git a/ompi/mca/coll/accelerator/coll_accelerator_scan.c b/ompi/mca/coll/accelerator/coll_accelerator_scan.c index ef7ac86971a..7018744c1c8 100644 --- a/ompi/mca/coll/accelerator/coll_accelerator_scan.c +++ b/ompi/mca/coll/accelerator/coll_accelerator_scan.c @@ -5,6 +5,7 @@ * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,11 +37,12 @@ int mca_coll_accelerator_scan(const void *sbuf, void *rbuf, size_t count, mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module; ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; + int sbuf_dev, rbuf_dev; size_t bufsize; int rc; bufsize = opal_datatype_span(&dtype->super, count, &gap); - rc = mca_coll_accelerator_check_buf((void *)sbuf); + rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev); if (rc < 0) { return rc; } @@ -49,10 +51,11 @@ int mca_coll_accelerator_scan(const void *sbuf, void *rbuf, size_t count, if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize); + mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); sbuf = sbuf1 - gap; } - rc = mca_coll_accelerator_check_buf(rbuf); + rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev); if (rc < 0) { return rc; } @@ -62,7 +65,8 @@ int mca_coll_accelerator_scan(const void *sbuf, void *rbuf, size_t count, if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } - mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize); + mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev, bufsize, + MCA_ACCELERATOR_TRANSFER_DTOH); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - gap; } @@ -73,7 +77,8 @@ int mca_coll_accelerator_scan(const void *sbuf, void *rbuf, size_t count, } if (NULL != rbuf1) { rbuf = rbuf2; - mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize); + mca_coll_accelerator_memcpy(rbuf, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize, + MCA_ACCELERATOR_TRANSFER_HTOD); free(rbuf1); } return rc;