@@ -154,49 +154,75 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
154
154
return dev_id ;
155
155
}
156
156
157
- static CUmemorytype accelerator_cuda_check_host_numa (CUdeviceptr dbuf )
157
+ static int accelerator_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type ,
158
+ int * dev_id )
158
159
{
159
- #if OPAL_CUDA_HOST_NUMA_SUPPORT
160
+ #if OPAL_CUDA_VMM_SUPPORT
161
+ static int device_count = -1 ;
160
162
CUmemAllocationProp prop ;
161
163
CUmemLocation location ;
162
164
CUresult result ;
163
165
unsigned long long flags ;
164
166
CUmemGenericAllocationHandle alloc_handle ;
165
- /* Check if memory is allocated using VMM API and see if host memory needs
166
- * to be treated as pinned device memory */
167
+
168
+ if (device_count == -1 ) {
169
+ result = cuDeviceGetCount (& device_count );
170
+ if (result != CUDA_SUCCESS ) {
171
+ return 0 ;
172
+ }
173
+ }
174
+
167
175
result = cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
168
176
if (result != CUDA_SUCCESS ) {
169
- return CU_MEMORYTYPE_HOST ;
177
+ return 0 ;
170
178
}
171
179
172
180
result = cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
173
181
if (result != CUDA_SUCCESS ) {
174
- return CU_MEMORYTYPE_HOST ;
175
- }
176
-
177
- if ((CU_MEM_LOCATION_TYPE_HOST == prop .location .type ) ||
178
- (CU_MEM_LOCATION_TYPE_HOST_NUMA == prop .location .type ) ||
179
- (CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == prop .location .type )) {
180
- /* If host has read-write access, then range is accessible by CPU */
181
- result = cuMemGetAccess (& flags , & location , dbuf );
182
- if ((CUDA_SUCCESS == result ) &&
183
- ((CU_MEM_LOCATION_TYPE_HOST == location .type ) ||
184
- (CU_MEM_LOCATION_TYPE_HOST_NUMA == location .type ) ||
185
- (CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == location .type )) &&
186
- (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
187
- return CU_MEMORYTYPE_HOST ;
188
- } else {
189
- return CU_MEMORYTYPE_DEVICE ;
182
+ cuMemRelease (alloc_handle );
183
+ return 0 ;
184
+ }
185
+
186
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
187
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
188
+ * dev_id = prop .location .id ;
189
+ cuMemRelease (alloc_handle );
190
+ return 1 ;
191
+ }
192
+
193
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
194
+ /* check if device has access */
195
+ for (int i = 0 ; i < device_count ; i ++ ) {
196
+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
197
+ location .id = i ;
198
+ result = cuMemGetAccess (& flags , & location , dbuf );
199
+ if ((CUDA_SUCCESS == result ) &&
200
+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
201
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
202
+ * dev_id = i ;
203
+ cuMemRelease (alloc_handle );
204
+ return 1 ;
205
+ }
190
206
}
191
207
}
192
- #else
193
- return CU_MEMORYTYPE_HOST ;
208
+
209
+ /* host must have access as device access possibility is exhausted */
210
+ * mem_type = CU_MEMORYTYPE_HOST ;
211
+ * dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
212
+ cuMemRelease (alloc_handle );
213
+ return 1 ;
214
+
194
215
#endif
216
+
217
+ return 0 ;
195
218
}
196
219
197
220
static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
198
221
{
199
222
CUresult result ;
223
+ int is_vmm = 0 ;
224
+ int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
225
+ CUmemorytype vmm_mem_type = 0 ;
200
226
CUmemorytype mem_type = 0 ;
201
227
CUdeviceptr dbuf = (CUdeviceptr ) addr ;
202
228
CUcontext ctx = NULL , mem_ctx = NULL ;
@@ -208,6 +234,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
208
234
209
235
* flags = 0 ;
210
236
237
+ is_vmm = accelerator_cuda_check_vmm (dbuf , & vmm_mem_type , & vmm_dev_id );
238
+
211
239
#if OPAL_CUDA_GET_ATTRIBUTES
212
240
uint32_t is_managed = 0 ;
213
241
/* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -237,20 +265,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
237
265
return OPAL_ERROR ;
238
266
}
239
267
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
240
- mem_type = accelerator_cuda_check_host_numa (dbuf );
241
- if (CU_MEMORYTYPE_HOST == mem_type ) {
268
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
269
+ mem_type = CU_MEMORYTYPE_DEVICE ;
270
+ * dev_id = vmm_dev_id ;
271
+ } else {
242
272
/* Host memory, nothing to do here */
243
273
return 0 ;
244
274
}
245
275
} else if (0 == mem_type ) {
246
276
/* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
247
277
return 0 ;
248
278
} else {
249
- /* query the device from the context */
250
- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
279
+ if (is_vmm ) {
280
+ * dev_id = vmm_dev_id ;
281
+ } else {
282
+ /* query the device from the context */
283
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
284
+ }
251
285
}
252
- /* Must be a device pointer */
253
- assert (CU_MEMORYTYPE_DEVICE == mem_type );
254
286
#else /* OPAL_CUDA_GET_ATTRIBUTES */
255
287
result = cuPointerGetAttribute (& mem_type , CU_POINTER_ATTRIBUTE_MEMORY_TYPE , dbuf );
256
288
if (CUDA_SUCCESS != result ) {
@@ -261,19 +293,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
261
293
return OPAL_ERROR ;
262
294
}
263
295
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
264
- mem_type = accelerator_cuda_check_host_numa (dbuf );
265
- if (CU_MEMORYTYPE_HOST == mem_type ) {
296
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
297
+ mem_type = CU_MEMORYTYPE_DEVICE ;
298
+ * dev_id = vmm_dev_id ;
299
+ } else {
266
300
/* Host memory, nothing to do here */
267
301
return 0 ;
268
302
}
269
303
} else {
270
- result = cuPointerGetAttribute (& mem_ctx , CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
271
- /* query the device from the context */
272
- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
304
+ if (is_vmm ) {
305
+ * dev_id = vmm_dev_id ;
306
+ } else {
307
+ result = cuPointerGetAttribute (& mem_ctx ,
308
+ CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
309
+ /* query the device from the context */
310
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
311
+ }
273
312
}
313
+ #endif /* OPAL_CUDA_GET_ATTRIBUTES */
314
+
274
315
/* Must be a device pointer */
275
316
assert (CU_MEMORYTYPE_DEVICE == mem_type );
276
- #endif /* OPAL_CUDA_GET_ATTRIBUTES */
277
317
278
318
/* This piece of code was added in to handle in a case involving
279
319
* OMP threads. The user had initialized CUDA and then spawned
@@ -296,6 +336,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
296
336
return OPAL_ERROR ;
297
337
}
298
338
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
339
+ if (is_vmm ) {
340
+ /* This function is expected to set context if pointer is device
341
+ * accessible but VMM allocations have NULL context associated
342
+ * which cannot be set against the calling thread */
343
+ opal_output (0 ,
344
+ "CUDA: unable to set context with the given pointer "
345
+ "ptr=%p aborting..." , addr );
346
+ return OPAL_ERROR ;
347
+ }
348
+
299
349
result = cuCtxSetCurrent (mem_ctx );
300
350
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
301
351
opal_output (0 ,
0 commit comments