Skip to content

Commit 3d59428

Browse files
committed
accelerator/cuda: Add delayed initialization logic
The current implementation requires the application to do cudaInit before calling MPI_Init. Added delayed initialization logic to wait as long as possible before creating resources requiring a cuContext. Signed-off-by: William Zhang <[email protected]>
1 parent 4e8bc42 commit 3d59428

File tree

3 files changed

+102
-15
lines changed

3 files changed

+102
-15
lines changed

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
195195
return 0;
196196
}
197197
}
198-
198+
/* First access on a device pointer finalizes CUDA support initialization. */
199+
opal_accelerator_cuda_delayed_init();
199200
return 1;
200201
}
201202

202203
static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
203204
{
204205
CUresult result;
205-
206+
result = opal_accelerator_cuda_delayed_init();
207+
if (0 != result) {
208+
return result;
209+
}
206210
*stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
207211
if (NULL == *stream) {
208212
return OPAL_ERR_OUT_OF_RESOURCE;
@@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
248252
static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event)
249253
{
250254
CUresult result;
255+
result = opal_accelerator_cuda_delayed_init();
256+
if (0 != result) {
257+
return result;
258+
}
251259

252260
*event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t);
253261
if (NULL == *event) {
@@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
340348
{
341349
CUresult result;
342350

351+
result = opal_accelerator_cuda_delayed_init();
352+
if (0 != result) {
353+
return result;
354+
}
355+
343356
if (NULL == stream || NULL == dest || NULL == src || size <= 0) {
344357
return OPAL_ERR_BAD_PARAM;
345358
}
@@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
358371
{
359372
CUresult result;
360373

374+
result = opal_accelerator_cuda_delayed_init();
375+
if (0 != result) {
376+
return result;
377+
}
378+
361379
if (NULL == dest || NULL == src || size <= 0) {
362380
return OPAL_ERR_BAD_PARAM;
363381
}
@@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
391409
CUdeviceptr tmp;
392410
CUresult result;
393411

412+
result = opal_accelerator_cuda_delayed_init();
413+
if (0 != result) {
414+
return result;
415+
}
416+
394417
if (NULL == dest || NULL == src || size <= 0) {
395418
return OPAL_ERR_BAD_PARAM;
396419
}
@@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
425448
{
426449
CUresult result;
427450

451+
result = opal_accelerator_cuda_delayed_init();
452+
if (0 != result) {
453+
return result;
454+
}
455+
428456
if (NULL == ptr || 0 == size) {
429457
return OPAL_ERR_BAD_PARAM;
430458
}
@@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
459487
{
460488
CUresult result;
461489

490+
result = opal_accelerator_cuda_delayed_init();
491+
if (0 != result) {
492+
return result;
493+
}
494+
462495
if (NULL == ptr || NULL == base || NULL == size) {
463496
return OPAL_ERR_BAD_PARAM;
464497
}
@@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
479512
static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
480513
{
481514
CUresult result;
515+
result = opal_accelerator_cuda_delayed_init();
516+
if (0 != result) {
517+
return result;
518+
}
519+
482520
if (NULL == ptr && size > 0) {
483521
return OPAL_ERR_BAD_PARAM;
484522
}
@@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
512550
CUdevice cuDev;
513551
CUresult result;
514552

553+
result = opal_accelerator_cuda_delayed_init();
554+
if (0 != result) {
555+
return result;
556+
}
557+
515558
if (NULL == dev_id) {
516559
return OPAL_ERR_BAD_PARAM;
517560
}
@@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
530573
{
531574
CUresult result;
532575

576+
result = opal_accelerator_cuda_delayed_init();
577+
if (0 != result) {
578+
return result;
579+
}
580+
533581
if (NULL == access) {
534582
return OPAL_ERR_BAD_PARAM;
535583
}
@@ -554,6 +602,12 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
554602
{
555603
CUresult result;
556604
int enable = 1;
605+
606+
result = opal_accelerator_cuda_delayed_init();
607+
if (0 != result) {
608+
return result;
609+
}
610+
557611
result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
558612
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
559613
opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,

opal/mca/accelerator/cuda/accelerator_cuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp
4545

4646
OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
4747

48+
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
49+
4850
END_C_DECLS
4951

5052
#endif /* MCA_ACCELERATOR_CUDA_H */

opal/mca/accelerator/cuda/accelerator_cuda_component.c

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,16 @@
3131
#include "opal/util/printf.h"
3232
#include "opal/util/proc.h"
3333
#include "opal/util/show_help.h"
34-
34+
#include "opal/sys/atomic.h"
3535

3636
/* Define global variables, used in accelerator_cuda.c */
3737
CUstream opal_accelerator_cuda_memcpy_stream = NULL;
3838
opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
3939

40+
/* Initialization lock for delayed cuda initialization */
41+
static opal_mutex_t accelerator_cuda_init_lock;
42+
static bool accelerator_cuda_init_complete = false;
43+
4044
#define STRINGIFY2(x) #x
4145
#define STRINGIFY(x) STRINGIFY2(x)
4246

@@ -115,30 +119,34 @@ static int accelerator_cuda_component_register(void)
115119
return OPAL_SUCCESS;
116120
}
117121

118-
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
122+
int opal_accelerator_cuda_delayed_init()
119123
{
120-
int retval, i, j;
121-
CUresult result;
124+
CUresult result = OPAL_SUCCESS;
122125
CUcontext cuContext;
123126

124-
OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
127+
/* Double checked locking to avoid having to
128+
* grab locks post lazy-initialization. */
129+
opal_atomic_rmb();
130+
if (true == accelerator_cuda_init_complete) {
131+
return OPAL_SUCCESS;
132+
}
133+
OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);
125134

126-
/* First check if the support is enabled. In the case that the user has
127-
* turned it off, we do not need to continue with any CUDA specific
128-
* initialization. Do this after MCA parameter registration. */
129-
if (!opal_cuda_support) {
130-
return NULL;
135+
/* If already initialized, just exit */
136+
if (true == accelerator_cuda_init_complete) {
137+
goto out;
131138
}
132139

133140
/* Check to see if this process is running in a CUDA context. If
134141
* so, all is good. If not, then disable registration of memory. */
135142
result = cuCtxGetCurrent(&cuContext);
136143
if (CUDA_SUCCESS != result) {
137144
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
138-
return NULL;
145+
goto out;
139146
} else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
140147
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
141-
return NULL;
148+
result = OPAL_ERROR;
149+
goto out;
142150
} else {
143151
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
144152
}
@@ -148,7 +156,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
148156
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
149157
opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
150158
OPAL_PROC_MY_HOSTNAME, result);
151-
return NULL;
159+
goto out;
152160
}
153161

154162
result = cuMemHostRegister(&checkmem, sizeof(int), 0);
@@ -162,7 +170,29 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
162170
opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
163171
"CUDA: cuMemHostRegister OK on test region");
164172
}
173+
result = OPAL_SUCCESS;
174+
opal_atomic_wmb();
175+
accelerator_cuda_init_complete = true;
176+
out:
177+
OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
178+
return result;
179+
}
180+
181+
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
182+
{
183+
int retval, i, j;
184+
CUresult result;
185+
186+
OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
187+
OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);
188+
/* First check if the support is enabled. In the case that the user has
189+
* turned it off, we do not need to continue with any CUDA specific
190+
* initialization. Do this after MCA parameter registration. */
191+
if (!opal_cuda_support) {
192+
return NULL;
193+
}
165194

195+
opal_accelerator_cuda_delayed_init();
166196
return &opal_accelerator_cuda_module;
167197
}
168198

@@ -183,5 +213,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
183213
}
184214

185215
OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
216+
OBJ_DESTRUCT(&accelerator_cuda_init_lock);
186217
return;
187218
}

0 commit comments

Comments
 (0)