Skip to content

Commit 2e83cf1

Browse files
committed
Add support for GPU buffers for PSM2 MTL
PSM2 supports GPU buffers and CUDA managed memory: it can directly recognize GPU buffers and handle copies between HFIs and GPUs. Therefore, OMPI is not required to handle GPU buffers for pt2pt cases. In this patch, we allow the PSM2 MTL to specify when it does not require CUDA convertor support. This allows us to skip the CUDA convertor init phases and lets PSM2 handle the memory transfers, which translates to improvements in latency. The patch enables blocking collectives and workloads with both GPU contiguous and GPU non-contiguous memory. Signed-off-by: Aravind Gopalakrishnan <[email protected]>
1 parent 79fc9d5 commit 2e83cf1

File tree

8 files changed

+116
-32
lines changed

8 files changed

+116
-32
lines changed

ompi/mca/mtl/mtl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
66
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
77
* reserved.
8+
* Copyright (c) 2017 Intel, Inc. All rights reserved
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -61,6 +62,9 @@ typedef struct mca_mtl_request_t mca_mtl_request_t;
6162
* MTL module flags
6263
*/
6364
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
65+
#if OPAL_CUDA_SUPPORT
66+
#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
67+
#endif
6468

6569
/**
6670
* Initialization routine for MTL component

ompi/mca/mtl/psm2/mtl_psm2.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2006 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006 QLogic Corporation. All rights reserved.
14-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
14+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
1515
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2016 Research Organization for Information Science
@@ -100,6 +100,9 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
100100
char *generated_key;
101101
char env_string[256];
102102
int rc;
103+
#if OPAL_CUDA_SUPPORT
104+
char *cuda_env;
105+
#endif
103106

104107
generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
105108
memset(uu, 0, sizeof(psm2_uuid_t));
@@ -173,6 +176,15 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
173176
/* register the psm2 progress function */
174177
opal_progress_register(ompi_mtl_psm2_progress);
175178

179+
#if OPAL_CUDA_SUPPORT
180+
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
181+
182+
cuda_env = getenv("PSM2_CUDA");
183+
if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
184+
opal_output(0, "Warning: If running with device buffers, there is a"
185+
" chance the application might fail. Try setting PSM2_CUDA=1.\n");
186+
#endif
187+
176188
return OMPI_SUCCESS;
177189
}
178190

ompi/mca/pml/cm/pml_cm.h

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* reserved.
77
* Copyright (c) 2015 Research Organization for Information Science
88
* and Technology (RIST). All rights reserved.
9+
* Copyright (c) 2017 Intel, Inc. All rights reserved
910
* $COPYRIGHT$
1011
*
1112
* Additional copyrights may follow
@@ -79,6 +80,7 @@ mca_pml_cm_irecv_init(void *addr,
7980
struct ompi_request_t **request)
8081
{
8182
mca_pml_cm_hvy_recv_request_t *recvreq;
83+
uint32_t flags = 0;
8284
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
8385
ompi_proc_t* ompi_proc;
8486
#endif
@@ -87,7 +89,7 @@ mca_pml_cm_irecv_init(void *addr,
8789
if( OPAL_UNLIKELY(NULL == recvreq) ) return OMPI_ERR_OUT_OF_RESOURCE;
8890

8991
MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
90-
datatype, addr, count, true);
92+
datatype, addr, count, flags, true);
9193

9294
*request = (ompi_request_t*) recvreq;
9395

@@ -104,6 +106,7 @@ mca_pml_cm_irecv(void *addr,
104106
struct ompi_request_t **request)
105107
{
106108
int ret;
109+
uint32_t flags = 0;
107110
mca_pml_cm_thin_recv_request_t *recvreq;
108111
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
109112
ompi_proc_t* ompi_proc = NULL;
@@ -118,7 +121,8 @@ mca_pml_cm_irecv(void *addr,
118121
src,
119122
datatype,
120123
addr,
121-
count);
124+
count,
125+
flags);
122126

123127
MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);
124128

@@ -145,6 +149,7 @@ mca_pml_cm_recv(void *addr,
145149
ompi_status_public_t * status)
146150
{
147151
int ret;
152+
uint32_t flags = 0;
148153
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
149154
ompi_proc_t *ompi_proc;
150155
#endif
@@ -173,20 +178,24 @@ mca_pml_cm_recv(void *addr,
173178
ompi_proc = ompi_comm_peer_lookup( comm, src );
174179
}
175180

181+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
182+
176183
opal_convertor_copy_and_prepare_for_recv(
177184
ompi_proc->super.proc_convertor,
178185
&(datatype->super),
179186
count,
180187
addr,
181-
0,
188+
flags,
182189
&convertor );
183190
#else
191+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
192+
184193
opal_convertor_copy_and_prepare_for_recv(
185194
ompi_mpi_local_convertor,
186195
&(datatype->super),
187196
count,
188197
addr,
189-
0,
198+
flags,
190199
&convertor );
191200
#endif
192201

@@ -222,6 +231,7 @@ mca_pml_cm_isend_init(const void* buf,
222231
ompi_request_t** request)
223232
{
224233
mca_pml_cm_hvy_send_request_t *sendreq;
234+
uint32_t flags = 0;
225235
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
226236
ompi_proc_t* ompi_proc;
227237
#endif
@@ -230,7 +240,7 @@ mca_pml_cm_isend_init(const void* buf,
230240
if (OPAL_UNLIKELY(NULL == sendreq)) return OMPI_ERR_OUT_OF_RESOURCE;
231241

232242
MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
233-
datatype, sendmode, true, false, buf, count);
243+
datatype, sendmode, true, false, buf, count, flags);
234244

235245
/* Work around a leak in start by marking this request as complete. The
236246
* problem occured because we do not have a way to differentiate an
@@ -254,6 +264,7 @@ mca_pml_cm_isend(const void* buf,
254264
ompi_request_t** request)
255265
{
256266
int ret;
267+
uint32_t flags = 0;
257268

258269
if(sendmode == MCA_PML_BASE_SEND_BUFFERED ) {
259270
mca_pml_cm_hvy_send_request_t* sendreq;
@@ -274,7 +285,8 @@ mca_pml_cm_isend(const void* buf,
274285
false,
275286
false,
276287
buf,
277-
count);
288+
count,
289+
flags);
278290

279291
MCA_PML_CM_HVY_SEND_REQUEST_START( sendreq, ret);
280292

@@ -296,7 +308,8 @@ mca_pml_cm_isend(const void* buf,
296308
datatype,
297309
sendmode,
298310
buf,
299-
count);
311+
count,
312+
flags);
300313

301314
MCA_PML_CM_THIN_SEND_REQUEST_START(
302315
sendreq,
@@ -324,6 +337,7 @@ mca_pml_cm_send(const void *buf,
324337
ompi_communicator_t* comm)
325338
{
326339
int ret = OMPI_ERROR;
340+
uint32_t flags = 0;
327341
ompi_proc_t * ompi_proc;
328342

329343
if(sendmode == MCA_PML_BASE_SEND_BUFFERED) {
@@ -342,7 +356,8 @@ mca_pml_cm_send(const void *buf,
342356
false,
343357
false,
344358
buf,
345-
count);
359+
count,
360+
flags);
346361
MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret);
347362
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
348363
MCA_PML_CM_HVY_SEND_REQUEST_RETURN(sendreq);
@@ -368,9 +383,12 @@ mca_pml_cm_send(const void *buf,
368383
#endif
369384
{
370385
ompi_proc = ompi_comm_peer_lookup(comm, dst);
386+
387+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
388+
371389
opal_convertor_copy_and_prepare_for_send(
372390
ompi_proc->super.proc_convertor,
373-
&datatype->super, count, buf, 0,
391+
&datatype->super, count, buf, flags,
374392
&convertor);
375393
}
376394

@@ -459,6 +477,7 @@ mca_pml_cm_imrecv(void *buf,
459477
struct ompi_request_t **request)
460478
{
461479
int ret;
480+
uint32_t flags = 0;
462481
mca_pml_cm_thin_recv_request_t *recvreq;
463482
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
464483
ompi_proc_t* ompi_proc;
@@ -474,7 +493,8 @@ mca_pml_cm_imrecv(void *buf,
474493
(*message)->peer,
475494
datatype,
476495
buf,
477-
count);
496+
count,
497+
flags);
478498

479499
MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);
480500

@@ -491,6 +511,7 @@ mca_pml_cm_mrecv(void *buf,
491511
ompi_status_public_t* status)
492512
{
493513
int ret;
514+
uint32_t flags = 0;
494515
mca_pml_cm_thin_recv_request_t *recvreq;
495516
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
496517
ompi_proc_t* ompi_proc;
@@ -506,7 +527,8 @@ mca_pml_cm_mrecv(void *buf,
506527
(*message)->peer,
507528
datatype,
508529
buf,
509-
count);
530+
count,
531+
flags);
510532

511533
MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq,
512534
message, ret);

ompi/mca/pml/cm/pml_cm_recvreq.h

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
1414
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16+
* Copyright (c) 2017 Intel, Inc. All rights reserved
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -92,7 +93,8 @@ do { \
9293
src, \
9394
datatype, \
9495
addr, \
95-
count ) \
96+
count, \
97+
flags ) \
9698
do { \
9799
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
98100
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -108,12 +110,13 @@ do { \
108110
} else { \
109111
ompi_proc = ompi_comm_peer_lookup( comm, src ); \
110112
} \
113+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \
111114
opal_convertor_copy_and_prepare_for_recv( \
112115
ompi_proc->super.proc_convertor, \
113116
&(datatype->super), \
114117
count, \
115118
addr, \
116-
0, \
119+
flags, \
117120
&(request)->req_base.req_convertor ); \
118121
} while(0)
119122
#else
@@ -123,7 +126,8 @@ do { \
123126
src, \
124127
datatype, \
125128
addr, \
126-
count ) \
129+
count, \
130+
flags ) \
127131
do { \
128132
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
129133
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -134,12 +138,13 @@ do { \
134138
OBJ_RETAIN(comm); \
135139
OMPI_DATATYPE_RETAIN(datatype); \
136140
\
141+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \
137142
opal_convertor_copy_and_prepare_for_recv( \
138143
ompi_mpi_local_convertor, \
139144
&(datatype->super), \
140145
count, \
141146
addr, \
142-
0, \
147+
flags, \
143148
&(request)->req_base.req_convertor ); \
144149
} while(0)
145150
#endif
@@ -153,6 +158,7 @@ do { \
153158
datatype, \
154159
addr, \
155160
count, \
161+
flags, \
156162
persistent) \
157163
do { \
158164
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -173,12 +179,13 @@ do { \
173179
} else { \
174180
ompi_proc = ompi_comm_peer_lookup( comm, src ); \
175181
} \
182+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \
176183
opal_convertor_copy_and_prepare_for_recv( \
177184
ompi_proc->super.proc_convertor, \
178185
&(datatype->super), \
179186
count, \
180187
addr, \
181-
0, \
188+
flags, \
182189
&(request)->req_base.req_convertor ); \
183190
} while(0)
184191
#else
@@ -190,6 +197,7 @@ do { \
190197
datatype, \
191198
addr, \
192199
count, \
200+
flags, \
193201
persistent) \
194202
do { \
195203
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -205,12 +213,13 @@ do { \
205213
OBJ_RETAIN(comm); \
206214
OMPI_DATATYPE_RETAIN(datatype); \
207215
\
216+
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \
208217
opal_convertor_copy_and_prepare_for_recv( \
209218
ompi_mpi_local_convertor, \
210219
&(datatype->super), \
211220
count, \
212221
addr, \
213-
0, \
222+
flags, \
214223
&(request)->req_base.req_convertor ); \
215224
} while(0)
216225
#endif

ompi/mca/pml/cm/pml_cm_request.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2006 The Regents of the University of California.
1111
* All rights reserved.
12+
* Copyright (c) 2017 Intel, Inc. All rights reserved
1213
* $COPYRIGHT$
1314
*
1415
* Additional copyrights may follow
@@ -53,4 +54,20 @@ struct mca_pml_cm_request_t {
5354
typedef struct mca_pml_cm_request_t mca_pml_cm_request_t;
5455
OBJ_CLASS_DECLARATION(mca_pml_cm_request_t);
5556

57+
/*
58+
* Avoid CUDA convertor inits only for contiguous memory and if indicated by
59+
* the MTL. For non-contiguous memory, do not skip CUDA convertor init phases.
60+
*/
61+
#if OPAL_CUDA_SUPPORT
62+
#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count) \
63+
{ \
64+
if (opal_datatype_is_contiguous_memory_layout(&datatype->super, count) \
65+
&& (ompi_mtl->mtl_flags & MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE)) { \
66+
flags |= CONVERTOR_SKIP_CUDA_INIT; \
67+
} \
68+
}
69+
#else
70+
#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)
71+
#endif
72+
5673
#endif

0 commit comments

Comments
 (0)