
Commit 8b24867

fcoll/vulcan: add support for GPU aggregation buffers
If the user input buffers are GPU device memory, also use GPU device memory for the aggregation step. This allows the data transfer to occur between GPU buffers and hence to take advantage of the much higher bandwidth of GPU-GPU interconnects (e.g. XGMI, NVLink). The downside of this approach is that we cannot call directly into the fbtl ipwritev routine, but have to go through the common_ompio_file_iwrite_pregen routine, which performs the necessary segmenting and staging through host memory.

Signed-off-by: Edgar Gabriel <[email protected]>
Parent: a6d5b36 · Commit: 8b24867

7 files changed: +149 −26 lines
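At a high level, the commit adds one decision to vulcan's write_all path: if the user buffer lives in device memory (and the feature is enabled), the per-aggregator aggregation buffers are allocated through the accelerator framework, and the write is routed through the new pregen path instead of handing the buffer to the fbtl directly. The fragment below is a condensed paraphrase of the hunks that follow; error handling and most arguments are omitted, so it is illustrative only, not compilable code:

    /* fcoll/vulcan/fcoll_vulcan_file_write_all.c: decide whether device
     * memory may be used for the aggregation buffers */
    mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
    if (is_gpu && !is_managed &&
        fh->f_get_mca_parameter_value ("use_accelerator_buffers",
                                       strlen("use_accelerator_buffers"))) {
        use_accelerator_buffer = true;
    }

    /* allocate the per-aggregator buffers on the device or on the host */
    if (use_accelerator_buffer) {
        opal_accelerator.mem_alloc (MCA_ACCELERATOR_NO_DEVICE_ID,
                                    (void**)&aggr_data[i]->global_buf, bytes_per_cycle);
    } else {
        aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
    }

    /* write_init(): a device-resident aggregation buffer cannot be passed
     * to the fbtl directly, so stage it through host memory instead */
    if (is_accelerator_buffer) {
        ret = mca_common_ompio_file_iwrite_pregen (fh, (ompi_request_t *) ompio_req);
    } else {
        ret = fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req);
    }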

ompi/mca/common/ompio/common_ompio.h

Lines changed: 2 additions & 0 deletions
@@ -262,6 +262,8 @@ OMPI_DECLSPEC int mca_common_ompio_file_write_at (ompio_file_t *fh, OMPI_MPI_OFF
 OMPI_DECLSPEC int mca_common_ompio_file_iwrite (ompio_file_t *fh, const void *buf, size_t count,
                                                 struct ompi_datatype_t *datatype, ompi_request_t **request);
 
+OMPI_DECLSPEC int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh, ompi_request_t *request);
+
 OMPI_DECLSPEC int mca_common_ompio_file_iwrite_at (ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE offset,
                                                    const void *buf, size_t count, struct ompi_datatype_t *datatype,
                                                    ompi_request_t **request);

ompi/mca/common/ompio/common_ompio_file_write.c

Lines changed: 68 additions & 1 deletion
@@ -12,7 +12,7 @@
  * Copyright (c) 2008-2019 University of Houston. All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
- * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024 Triad National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
@@ -329,6 +329,7 @@ static void mca_common_ompio_post_next_write_subreq(struct mca_ompio_request_t *
     decoded_iov.iov_base = req->req_tbuf;
     decoded_iov.iov_len = req->req_size;
     opal_convertor_pack (&req->req_convertor, &decoded_iov, &iov_count, &pos);
+
     mca_common_ompio_build_io_array (req->req_fview, index, req->req_num_subreqs,
                                      bytes_per_cycle, pos,
                                      iov_count, &decoded_iov,
@@ -472,6 +473,72 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
     return ret;
 }
 
+/*
+** This routine is invoked from the fcoll component.
+** It is only used if the temporary buffer is a GPU buffer,
+** and the fbtl supports the ipwritev operation.
+**
+** The io-array has already been generated in fcoll/xxx/file_write_all,
+** and we use the pre-computed offsets to create a pseudo fview.
+** The position of the file pointer is updated in the fcoll
+** component, not here.
+*/
+
+int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh,
+                                         ompi_request_t *request)
+{
+    uint32_t i;
+    size_t max_data;
+    size_t pipeline_buf_size;
+    mca_ompio_request_t *ompio_req = (mca_ompio_request_t *) request;
+
+    if (NULL == fh->f_fbtl->fbtl_ipwritev) {
+        return MPI_ERR_INTERN;
+    }
+
+    max_data          = fh->f_io_array[0].length;
+    pipeline_buf_size = OMPIO_MCA_GET(fh, pipeline_buffer_size);
+
+    mca_common_ompio_register_progress ();
+
+    OMPIO_PREPARE_BUF (fh, fh->f_io_array[0].memory_address, max_data, MPI_BYTE,
+                       ompio_req->req_tbuf, &ompio_req->req_convertor, max_data,
+                       pipeline_buf_size, NULL, i);
+
+    ompio_req->req_num_subreqs = ceil((double)max_data/pipeline_buf_size);
+    ompio_req->req_size        = pipeline_buf_size;
+    ompio_req->req_max_data    = max_data;
+    ompio_req->req_post_next_subreq = mca_common_ompio_post_next_write_subreq;
+    ompio_req->req_fh = fh;
+    ompio_req->req_ompi.req_status.MPI_ERROR = MPI_SUCCESS;
+
+    ompio_req->req_fview = (struct ompio_fview_t *) calloc(1, sizeof(struct ompio_fview_t));
+    if (NULL == ompio_req->req_fview) {
+        opal_output(1, "common_ompio: error allocating memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_decoded_iov = (struct iovec*) malloc (fh->f_num_of_io_entries *
+                                                                  sizeof(struct iovec));
+    if (NULL == ompio_req->req_fview->f_decoded_iov) {
+        opal_output(1, "common_ompio_file_iwrite_pregen: could not allocate memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_iov_count = fh->f_num_of_io_entries;
+    for (i=0; i < ompio_req->req_fview->f_iov_count; i++) {
+        ompio_req->req_fview->f_decoded_iov[i].iov_base = fh->f_io_array[i].offset;
+        ompio_req->req_fview->f_decoded_iov[i].iov_len  = fh->f_io_array[i].length;
+    }
+
+    fh->f_num_of_io_entries = 0;
+    free (fh->f_io_array);
+    fh->f_io_array = NULL;
+
+    mca_common_ompio_post_next_write_subreq(ompio_req, 0);
+    return OMPI_SUCCESS;
+}
+
 int mca_common_ompio_file_iwrite_at (ompio_file_t *fh,
                                      OMPI_MPI_OFFSET_TYPE offset,
                                      const void *buf,
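In this pregen path, the aggregated data (fh->f_io_array[0].length bytes, already resident in the device-side aggregation buffer) is written in ceil(max_data / pipeline_buf_size) sub-requests; each sub-request packs one pipeline-buffer-sized chunk into the host staging buffer set up by OMPIO_PREPARE_BUF before the fbtl ipwritev call is issued. For a hypothetical max_data of 32 MB and a pipeline_buffer_size of 8 MB, that amounts to four staged 8 MB writes.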

ompi/mca/fcoll/vulcan/fcoll_vulcan.h

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ extern int mca_fcoll_vulcan_priority;
 extern int mca_fcoll_vulcan_num_groups;
 extern int mca_fcoll_vulcan_write_chunksize;
 extern int mca_fcoll_vulcan_async_io;
+extern int mca_fcoll_vulcan_use_accelerator_buffers;
 
 OMPI_DECLSPEC extern mca_fcoll_base_component_3_0_0_t mca_fcoll_vulcan_component;
 
ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c

Lines changed: 62 additions & 20 deletions
@@ -15,6 +15,7 @@
  * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
  * Copyright (c) 2024 Triad National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -30,10 +31,12 @@
 #include "ompi/mca/fcoll/fcoll.h"
 #include "ompi/mca/fcoll/base/fcoll_base_coll_array.h"
 #include "ompi/mca/common/ompio/common_ompio.h"
+#include "ompi/mca/common/ompio/common_ompio_buffer.h"
 #include "ompi/mca/io/io.h"
 #include "ompi/mca/common/ompio/common_ompio_request.h"
 #include "math.h"
 #include "ompi/mca/pml/pml.h"
+#include "opal/mca/accelerator/accelerator.h"
 #include <unistd.h>
 
 #define DEBUG_ON 0
@@ -88,13 +91,12 @@ typedef struct mca_io_ompio_aggregator_data {
     _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \
 }
 
-
-
 static int shuffle_init ( int index, int cycles, int aggregator, int rank,
                           mca_io_ompio_aggregator_data *data,
                           ompi_request_t **reqs );
 static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data,
-                       int write_chunksize, int write_synchType, ompi_request_t **request);
+                       int write_chunksize, int write_synchType, ompi_request_t **request,
+                       bool is_accelerator_buffer);
 int mca_fcoll_vulcan_break_file_view ( struct iovec *decoded_iov, int iov_count,
                                        struct iovec *local_iov_array, int local_count,
                                        struct iovec ***broken_decoded_iovs, int **broken_iov_counts,
@@ -155,6 +157,8 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
 
     ompi_count_array_t fview_count_desc;
     ompi_disp_array_t displs_desc;
+    int is_gpu, is_managed;
+    bool use_accelerator_buffer = false;
 
 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
@@ -180,6 +184,11 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
         goto exit;
     }
 
+    mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
+    if (is_gpu && !is_managed &&
+        fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) {
+        use_accelerator_buffer = true;
+    }
     /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what
        the user requested */
     bytes_per_cycle = bytes_per_cycle/2;
@@ -529,13 +538,31 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
             goto exit;
         }
 
-
-        aggr_data[i]->global_buf      = (char *) malloc (bytes_per_cycle);
-        aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
-        if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
-            opal_output(1, "OUT OF MEMORY");
-            ret = OMPI_ERR_OUT_OF_RESOURCE;
-            goto exit;
+        if (use_accelerator_buffer) {
+            opal_output_verbose(10, ompi_fcoll_base_framework.framework_output,
+                                "Allocating GPU device buffer for aggregation\n");
+            ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf,
+                                             bytes_per_cycle);
+            if (OPAL_SUCCESS != ret) {
+                opal_output(1, "Could not allocate accelerator memory");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
+            ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->prev_global_buf,
+                                             bytes_per_cycle);
+            if (OPAL_SUCCESS != ret) {
+                opal_output(1, "Could not allocate accelerator memory");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
+        } else {
+            aggr_data[i]->global_buf      = (char *) malloc (bytes_per_cycle);
+            aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
+            if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
+                opal_output(1, "OUT OF MEMORY");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
         }
 
         aggr_data[i]->recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group *
@@ -605,7 +632,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
     start_write_time = MPI_Wtime();
 #endif
     ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
-                      write_chunksize, write_synch_type, &req_iwrite);
+                      write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
     if (OMPI_SUCCESS != ret){
         goto exit;
     }
@@ -645,7 +672,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
     start_write_time = MPI_Wtime();
 #endif
     ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
-                      write_chunksize, write_synch_type, &req_iwrite);
+                      write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
     if (OMPI_SUCCESS != ret){
         goto exit;
     }
@@ -704,8 +731,13 @@ exit :
 
         free (aggr_data[i]->disp_index);
         free (aggr_data[i]->max_disp_index);
-        free (aggr_data[i]->global_buf);
-        free (aggr_data[i]->prev_global_buf);
+        if (use_accelerator_buffer) {
+            opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf);
+            opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf);
+        } else {
+            free (aggr_data[i]->global_buf);
+            free (aggr_data[i]->prev_global_buf);
+        }
         for(l=0;l<aggr_data[i]->procs_per_group;l++){
             free (aggr_data[i]->blocklen_per_process[l]);
             free (aggr_data[i]->displs_per_process[l]);
@@ -749,7 +781,8 @@ static int write_init (ompio_file_t *fh,
                        mca_io_ompio_aggregator_data *aggr_data,
                        int write_chunksize,
                        int write_synchType,
-                       ompi_request_t **request )
+                       ompi_request_t **request,
+                       bool is_accelerator_buffer)
 {
     int ret = OMPI_SUCCESS;
     ssize_t ret_temp = 0;
@@ -770,11 +803,20 @@ static int write_init (ompio_file_t *fh,
                                     write_chunksize);
 
     if (1 == write_synchType) {
-        ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
-        if(0 > ret) {
-            opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
-            ompio_req->req_ompi.req_status.MPI_ERROR = ret;
-            ompio_req->req_ompi.req_status._ucount = 0;
+        if (is_accelerator_buffer) {
+            ret = mca_common_ompio_file_iwrite_pregen(fh, (ompi_request_t *) ompio_req);
+            if(0 > ret) {
+                opal_output (1, "vulcan_write_all: mca_common_ompio_iwrite_pregen failed\n");
+                ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+                ompio_req->req_ompi.req_status._ucount = 0;
+            }
+        } else {
+            ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
+            if(0 > ret) {
+                opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
+                ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+                ompio_req->req_ompi.req_status._ucount = 0;
+            }
         }
     }
     else {

ompi/mca/io/ompio/io_ompio.c

Lines changed: 4 additions & 1 deletion
@@ -15,7 +15,7 @@
  * Copyright (c) 2012-2013 Inria. All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
- * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -556,6 +556,9 @@ int mca_io_ompio_get_mca_parameter_value ( char *mca_parameter_name, int name_le
     else if ( !strncmp ( mca_parameter_name, "coll_timing_info", name_length )) {
         return mca_io_ompio_coll_timing_info;
     }
+    else if ( !strncmp (mca_parameter_name, "use_accelerator_buffers", name_length)) {
+        return mca_io_ompio_use_accelerator_buffers;
+    }
     else {
         opal_output (1, "Error in mca_io_ompio_get_mca_parameter_value: unknown parameter name");
     }

ompi/mca/io/ompio/io_ompio.h

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
  * Copyright (c) 2015-2018 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
- * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024 Triad National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
@@ -57,7 +57,7 @@ extern int mca_io_ompio_max_aggregators_ratio;
 extern int mca_io_ompio_aggregators_cutoff_threshold;
 extern int mca_io_ompio_overwrite_amode;
 extern int mca_io_ompio_verbose_info_parsing;
-
+extern int mca_io_ompio_use_accelerator_buffers;
 OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info;
 
 #define QUEUESIZE 2048

ompi/mca/io/ompio/io_ompio_component.c

Lines changed: 10 additions & 2 deletions
@@ -17,7 +17,7 @@
  * and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
  * Copyright (c) 2018 DataDirect Networks. All rights reserved.
- * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024 Triad National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
@@ -49,7 +49,7 @@ int mca_io_ompio_max_aggregators_ratio=8;
 int mca_io_ompio_aggregators_cutoff_threshold=3;
 int mca_io_ompio_overwrite_amode = 1;
 int mca_io_ompio_verbose_info_parsing = 0;
-
+int mca_io_ompio_use_accelerator_buffers = 1;
 int mca_io_ompio_grouping_option=5;
 
 /*
@@ -263,6 +263,14 @@ static int register_component(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_io_ompio_verbose_info_parsing);
 
+    mca_io_ompio_use_accelerator_buffers = 1;
+    (void) mca_base_component_var_register(&mca_io_ompio_component.io_version,
+                                           "use_accelerator_buffers", "Allow using accelerator buffers "
+                                           "for data aggregation in collective I/O if input buffer is device memory",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &mca_io_ompio_use_accelerator_buffers);
+
     return OMPI_SUCCESS;
 }
 
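The new control variable is registered on the io/ompio component with a default of 1 (enabled). Following Open MPI's usual framework_component_parameter naming, it should be settable at run time as io_ompio_use_accelerator_buffers (name inferred from the registration above, not stated in the commit); the vulcan component does not read the variable directly but queries it through fh->f_get_mca_parameter_value, as shown in the write_all hunk.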