accelerator/cuda: Add delayed initialization logic

wckzhang · wckzhang · commit 3d59428f79fc · 2023-01-06T19:05:03.000Z
The current implementation requires the application to
do cudaInit before calling MPI_Init. Add delayed
initialization logic that waits as long as possible
before creating resources requiring a cuContext.

Signed-off-by: William Zhang <wilzhang@amazon.com>
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
             return 0;
         }
     }
-
+    /* First access on a device pointer finalizes CUDA support initialization. */
+    opal_accelerator_cuda_delayed_init();
     return 1;
 }
 
 static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
 {
     CUresult result;
-
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
     *stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
     if (NULL == *stream) {
         return OPAL_ERR_OUT_OF_RESOURCE;
@@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
 static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event)
 {
     CUresult result;
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
 
     *event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t);
     if (NULL == *event) {
@@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
 {
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == stream || NULL == dest || NULL == src || size <= 0) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
 {
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == dest || NULL == src || size <= 0) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
     CUdeviceptr tmp;
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == dest || NULL == src || size <= 0) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
 {
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == ptr || 0 == size) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
 {
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == ptr || NULL == base || NULL == size) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
 static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
 {
     CUresult result;
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == ptr && size > 0) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
     CUdevice cuDev;
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == dev_id) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
 {
     CUresult result;
 
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     if (NULL == access) {
         return OPAL_ERR_BAD_PARAM;
     }
@@ -554,6 +602,12 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
 {
     CUresult result;
     int enable = 1;
+
+    result = opal_accelerator_cuda_delayed_init();
+    if (0 != result) {
+        return result;
+    }
+
     result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h
@@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp
 
 OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
 
+OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
+
 END_C_DECLS
 
 #endif /* MCA_ACCELERATOR_CUDA_H */
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
@@ -31,12 +31,16 @@
 #include "opal/util/printf.h"
 #include "opal/util/proc.h"
 #include "opal/util/show_help.h"
-
+#include "opal/sys/atomic.h"
 
 /* Define global variables, used in accelerator_cuda.c */
 CUstream opal_accelerator_cuda_memcpy_stream = NULL;
 opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
 
+/* Initialization lock for delayed cuda initialization */
+static opal_mutex_t accelerator_cuda_init_lock;
+static bool accelerator_cuda_init_complete = false;
+
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x)  STRINGIFY2(x)
 
@@ -115,30 +119,34 @@ static int accelerator_cuda_component_register(void)
     return OPAL_SUCCESS;
 }
 
-static opal_accelerator_base_module_t* accelerator_cuda_init(void)
+int opal_accelerator_cuda_delayed_init()
 {
-    int retval, i, j;
-    CUresult result;
+    CUresult result = OPAL_SUCCESS;
     CUcontext cuContext;
 
-    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
+    /* Double checked locking to avoid having to
+     * grab locks post lazy-initialization.  */
+    opal_atomic_rmb();
+    if (true == accelerator_cuda_init_complete) {
+        return OPAL_SUCCESS;
+    }
+    OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);
 
-    /* First check if the support is enabled.  In the case that the user has
-     * turned it off, we do not need to continue with any CUDA specific
-     * initialization.  Do this after MCA parameter registration. */
-    if (!opal_cuda_support) {
-        return NULL;
+    /* If already initialized, just exit */
+    if (true == accelerator_cuda_init_complete) {
+        goto out;
     }
 
     /* Check to see if this process is running in a CUDA context.  If
      * so, all is good.  If not, then disable registration of memory. */
     result = cuCtxGetCurrent(&cuContext);
     if (CUDA_SUCCESS != result) {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
-        return NULL;
+        goto out;
     } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
-        return NULL;
+        result = OPAL_ERROR;
+        goto out;
     } else {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
     }
@@ -148,7 +156,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
-        return NULL;
+        goto out;
     }
 
     result = cuMemHostRegister(&checkmem, sizeof(int), 0);
@@ -162,7 +170,29 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
                             "CUDA: cuMemHostRegister OK on test region");
     }
+    result = OPAL_SUCCESS;
+    opal_atomic_wmb();
+    accelerator_cuda_init_complete = true;
+out:
+    OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
+    return result;
+}
+
+static opal_accelerator_base_module_t* accelerator_cuda_init(void)
+{ /* Component init: construct both locks, then make a first attempt at CUDA setup. */
+    int retval, i, j; /* NOTE(review): not referenced in the body shown here -- confirm, then drop */
+    CUresult result; /* NOTE(review): likewise not referenced in the body shown here */
+
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);
+    /* Bail out before any CUDA-specific setup when the user has turned
+     * support off; NULL is returned in that case.  This check has to run
+     * after MCA parameter registration. */
+    if (!opal_cuda_support) {
+        return NULL;
+    }
 
+    opal_accelerator_cuda_delayed_init(); /* return value is not checked: the module is returned either way */
     return &opal_accelerator_cuda_module;
 }
 
@@ -183,5 +213,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
     }
 
     OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
+    OBJ_DESTRUCT(&accelerator_cuda_init_lock);
     return;
 }