Skip to content

Commit 3de5332

Browse files
committed
accelerator/cuda: Add delayed initialization logic
The current implementation requires the application to do cudaInit before calling MPI_Init. Added delayed initialization logic to wait as long as possible before creating resources requiring a cuContext. Signed-off-by: William Zhang <[email protected]>
1 parent 4e8bc42 commit 3de5332

File tree

3 files changed

+94
-15
lines changed

3 files changed

+94
-15
lines changed

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
195195
return 0;
196196
}
197197
}
198-
198+
/* First access on a device pointer finalizes CUDA support initialization. */
199+
opal_accelerator_cuda_delayed_init();
199200
return 1;
200201
}
201202

202203
static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
203204
{
204205
CUresult result;
205-
206+
result = opal_accelerator_cuda_delayed_init();
207+
if (0 != result) {
208+
return result;
209+
}
206210
*stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
207211
if (NULL == *stream) {
208212
return OPAL_ERR_OUT_OF_RESOURCE;
@@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
248252
static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event)
249253
{
250254
CUresult result;
255+
result = opal_accelerator_cuda_delayed_init();
256+
if (0 != result) {
257+
return result;
258+
}
251259

252260
*event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t);
253261
if (NULL == *event) {
@@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
340348
{
341349
CUresult result;
342350

351+
result = opal_accelerator_cuda_delayed_init();
352+
if (0 != result) {
353+
return result;
354+
}
355+
343356
if (NULL == stream || NULL == dest || NULL == src || size <= 0) {
344357
return OPAL_ERR_BAD_PARAM;
345358
}
@@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
358371
{
359372
CUresult result;
360373

374+
result = opal_accelerator_cuda_delayed_init();
375+
if (0 != result) {
376+
return result;
377+
}
378+
361379
if (NULL == dest || NULL == src || size <= 0) {
362380
return OPAL_ERR_BAD_PARAM;
363381
}
@@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
391409
CUdeviceptr tmp;
392410
CUresult result;
393411

412+
result = opal_accelerator_cuda_delayed_init();
413+
if (0 != result) {
414+
return result;
415+
}
416+
394417
if (NULL == dest || NULL == src || size <= 0) {
395418
return OPAL_ERR_BAD_PARAM;
396419
}
@@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
425448
{
426449
CUresult result;
427450

451+
result = opal_accelerator_cuda_delayed_init();
452+
if (0 != result) {
453+
return result;
454+
}
455+
428456
if (NULL == ptr || 0 == size) {
429457
return OPAL_ERR_BAD_PARAM;
430458
}
@@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
459487
{
460488
CUresult result;
461489

490+
result = opal_accelerator_cuda_delayed_init();
491+
if (0 != result) {
492+
return result;
493+
}
494+
462495
if (NULL == ptr || NULL == base || NULL == size) {
463496
return OPAL_ERR_BAD_PARAM;
464497
}
@@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
479512
static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
480513
{
481514
CUresult result;
515+
result = opal_accelerator_cuda_delayed_init();
516+
if (0 != result) {
517+
return result;
518+
}
519+
482520
if (NULL == ptr && size > 0) {
483521
return OPAL_ERR_BAD_PARAM;
484522
}
@@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
512550
CUdevice cuDev;
513551
CUresult result;
514552

553+
result = opal_accelerator_cuda_delayed_init();
554+
if (0 != result) {
555+
return result;
556+
}
557+
515558
if (NULL == dev_id) {
516559
return OPAL_ERR_BAD_PARAM;
517560
}
@@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
530573
{
531574
CUresult result;
532575

576+
result = opal_accelerator_cuda_delayed_init();
577+
if (0 != result) {
578+
return result;
579+
}
580+
533581
if (NULL == access) {
534582
return OPAL_ERR_BAD_PARAM;
535583
}
@@ -554,6 +602,12 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
554602
{
555603
CUresult result;
556604
int enable = 1;
605+
606+
result = opal_accelerator_cuda_delayed_init();
607+
if (0 != result) {
608+
return result;
609+
}
610+
557611
result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
558612
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
559613
opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,

opal/mca/accelerator/cuda/accelerator_cuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp
4545

4646
OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
4747

48+
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
49+
4850
END_C_DECLS
4951

5052
#endif /* MCA_ACCELERATOR_CUDA_H */

opal/mca/accelerator/cuda/accelerator_cuda_component.c

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
CUstream opal_accelerator_cuda_memcpy_stream = NULL;
3838
opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
3939

40+
/* Initialization lock for delayed cuda initialization */
41+
static opal_mutex_t accelerator_cuda_init_lock;
42+
static bool accelerator_cuda_init_complete = false;
43+
4044
#define STRINGIFY2(x) #x
4145
#define STRINGIFY(x) STRINGIFY2(x)
4246

@@ -115,30 +119,27 @@ static int accelerator_cuda_component_register(void)
115119
return OPAL_SUCCESS;
116120
}
117121

118-
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
122+
int opal_accelerator_cuda_delayed_init()
119123
{
120-
int retval, i, j;
121-
CUresult result;
124+
CUresult result = OPAL_SUCCESS;
122125
CUcontext cuContext;
126+
OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);
123127

124-
OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
125-
126-
/* First check if the support is enabled. In the case that the user has
127-
* turned it off, we do not need to continue with any CUDA specific
128-
* initialization. Do this after MCA parameter registration. */
129-
if (!opal_cuda_support) {
130-
return NULL;
128+
/* If already initialized, just exit */
129+
if (true == accelerator_cuda_init_complete) {
130+
goto out;
131131
}
132132

133133
/* Check to see if this process is running in a CUDA context. If
134134
* so, all is good. If not, then disable registration of memory. */
135135
result = cuCtxGetCurrent(&cuContext);
136136
if (CUDA_SUCCESS != result) {
137137
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
138-
return NULL;
138+
goto out;
139139
} else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
140140
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
141-
return NULL;
141+
result = OPAL_ERROR;
142+
goto out;
142143
} else {
143144
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
144145
}
@@ -148,7 +149,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
148149
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
149150
opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
150151
OPAL_PROC_MY_HOSTNAME, result);
151-
return NULL;
152+
goto out;
152153
}
153154

154155
result = cuMemHostRegister(&checkmem, sizeof(int), 0);
@@ -162,7 +163,28 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
162163
opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
163164
"CUDA: cuMemHostRegister OK on test region");
164165
}
166+
result = OPAL_SUCCESS;
167+
accelerator_cuda_init_complete = true;
168+
out:
169+
OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
170+
return result;
171+
}
172+
173+
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
174+
{
175+
int retval, i, j;
176+
CUresult result;
177+
178+
OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
179+
OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);
180+
/* First check if the support is enabled. In the case that the user has
181+
* turned it off, we do not need to continue with any CUDA specific
182+
* initialization. Do this after MCA parameter registration. */
183+
if (!opal_cuda_support) {
184+
return NULL;
185+
}
165186

187+
opal_accelerator_cuda_delayed_init();
166188
return &opal_accelerator_cuda_module;
167189
}
168190

@@ -183,5 +205,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
183205
}
184206

185207
OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
208+
OBJ_DESTRUCT(&accelerator_cuda_init_lock);
186209
return;
187210
}

0 commit comments

Comments
 (0)