Skip to content

Commit 86b9876

Browse files
opal/cuda: Handle VMM pointers in cuda_check_addr
Signed-off-by: Akshay Venkatesh <[email protected]>
1 parent d7b2676 commit 86b9876

File tree

2 files changed

+88
-38
lines changed

2 files changed

+88
-38
lines changed

config/opal_check_cuda.m4

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"],
121121
122122
# If we have CUDA support, check to see if we have support for cuMemCreate memory on host NUMA.
123123
AS_IF([test "$opal_check_cuda_happy"="yes"],
124-
[AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_HOST_NUMA_SUPPORT=1], [CUDA_HOST_NUMA_SUPPORT=0],
124+
[AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0],
125125
[#include <$opal_cuda_incdir/cuda.h>])],
126126
[])
127127
@@ -167,8 +167,8 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
167167
AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
168168
[Whether we want cuda device pointer support])
169169
170-
AM_CONDITIONAL([OPAL_cuda_host_numa_support], [test "x$CUDA_HOST_NUMA_SUPPORT" = "x1"])
171-
AC_DEFINE_UNQUOTED([OPAL_CUDA_HOST_NUMA_SUPPORT],$CUDA_HOST_NUMA_SUPPORT,
170+
AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"])
171+
AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT,
172172
[Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available])
173173
174174
AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -154,49 +154,75 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
154154
return dev_id;
155155
}
156156

157-
static CUmemorytype accelerator_cuda_check_host_numa(CUdeviceptr dbuf)
157+
static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
158+
int *dev_id)
158159
{
159-
#if OPAL_CUDA_HOST_NUMA_SUPPORT
160+
#if OPAL_CUDA_VMM_SUPPORT
161+
static int device_count = -1;
160162
CUmemAllocationProp prop;
161163
CUmemLocation location;
162164
CUresult result;
163165
unsigned long long flags;
164166
CUmemGenericAllocationHandle alloc_handle;
165-
/* Check if memory is allocated using VMM API and see if host memory needs
166-
* to be treated as pinned device memory */
167+
168+
if (device_count == -1) {
169+
result = cuDeviceGetCount(&device_count);
170+
if (result != CUDA_SUCCESS) {
171+
return 0;
172+
}
173+
}
174+
167175
result = cuMemRetainAllocationHandle(&alloc_handle, (void*)dbuf);
168176
if (result != CUDA_SUCCESS) {
169-
return CU_MEMORYTYPE_HOST;
177+
return 0;
170178
}
171179

172180
result = cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle);
173181
if (result != CUDA_SUCCESS) {
174-
return CU_MEMORYTYPE_HOST;
175-
}
176-
177-
if ((CU_MEM_LOCATION_TYPE_HOST == prop.location.type) ||
178-
(CU_MEM_LOCATION_TYPE_HOST_NUMA == prop.location.type) ||
179-
(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == prop.location.type)) {
180-
/* If host has read-write access, then range is accessible by CPU */
181-
result = cuMemGetAccess(&flags, &location, dbuf);
182-
if ((CUDA_SUCCESS == result) &&
183-
((CU_MEM_LOCATION_TYPE_HOST == location.type) ||
184-
(CU_MEM_LOCATION_TYPE_HOST_NUMA == location.type) ||
185-
(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == location.type)) &&
186-
(CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
187-
return CU_MEMORYTYPE_HOST;
188-
} else {
189-
return CU_MEMORYTYPE_DEVICE;
182+
cuMemRelease(alloc_handle);
183+
return 0;
184+
}
185+
186+
if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
187+
*mem_type = CU_MEMORYTYPE_DEVICE;
188+
*dev_id = prop.location.id;
189+
cuMemRelease(alloc_handle);
190+
return 1;
191+
}
192+
193+
if (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
194+
/* check if device has access */
195+
for (int i = 0; i < device_count; i++) {
196+
location.type = CU_MEM_LOCATION_TYPE_DEVICE;
197+
location.id = i;
198+
result = cuMemGetAccess(&flags, &location, dbuf);
199+
if ((CUDA_SUCCESS == result) &&
200+
(CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
201+
*mem_type = CU_MEMORYTYPE_DEVICE;
202+
*dev_id = i;
203+
cuMemRelease(alloc_handle);
204+
return 1;
205+
}
190206
}
191207
}
192-
#else
193-
return CU_MEMORYTYPE_HOST;
208+
209+
/* host must have access, since every possibility of device access has been exhausted */
210+
*mem_type = CU_MEMORYTYPE_HOST;
211+
*dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
212+
cuMemRelease(alloc_handle);
213+
return 1;
214+
194215
#endif
216+
217+
return 0;
195218
}
196219

197220
static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
198221
{
199222
CUresult result;
223+
int is_vmm = 0;
224+
int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
225+
CUmemorytype vmm_mem_type = 0;
200226
CUmemorytype mem_type = 0;
201227
CUdeviceptr dbuf = (CUdeviceptr) addr;
202228
CUcontext ctx = NULL, mem_ctx = NULL;
@@ -208,6 +234,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
208234

209235
*flags = 0;
210236

237+
is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
238+
211239
#if OPAL_CUDA_GET_ATTRIBUTES
212240
uint32_t is_managed = 0;
213241
/* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -237,20 +265,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
237265
return OPAL_ERROR;
238266
}
239267
} else if (CU_MEMORYTYPE_HOST == mem_type) {
240-
mem_type = accelerator_cuda_check_host_numa(dbuf);
241-
if (CU_MEMORYTYPE_HOST == mem_type) {
268+
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
269+
mem_type = CU_MEMORYTYPE_DEVICE;
270+
*dev_id = vmm_dev_id;
271+
} else {
242272
/* Host memory, nothing to do here */
243273
return 0;
244274
}
245275
} else if (0 == mem_type) {
246276
/* This can happen when CUDA is initialized but dbuf is not a valid CUDA pointer */
247277
return 0;
248278
} else {
249-
/* query the device from the context */
250-
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
279+
if (is_vmm) {
280+
*dev_id = vmm_dev_id;
281+
} else {
282+
/* query the device from the context */
283+
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
284+
}
251285
}
252-
/* Must be a device pointer */
253-
assert(CU_MEMORYTYPE_DEVICE == mem_type);
254286
#else /* OPAL_CUDA_GET_ATTRIBUTES */
255287
result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
256288
if (CUDA_SUCCESS != result) {
@@ -261,19 +293,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
261293
return OPAL_ERROR;
262294
}
263295
} else if (CU_MEMORYTYPE_HOST == mem_type) {
264-
mem_type = accelerator_cuda_check_host_numa(dbuf);
265-
if (CU_MEMORYTYPE_HOST == mem_type) {
296+
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
297+
mem_type = CU_MEMORYTYPE_DEVICE;
298+
*dev_id = vmm_dev_id;
299+
} else {
266300
/* Host memory, nothing to do here */
267301
return 0;
268302
}
269303
} else {
270-
result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
271-
/* query the device from the context */
272-
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
304+
if (is_vmm) {
305+
*dev_id = vmm_dev_id;
306+
} else {
307+
result = cuPointerGetAttribute(&mem_ctx,
308+
CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
309+
/* query the device from the context */
310+
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
311+
}
273312
}
313+
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
314+
274315
/* Must be a device pointer */
275316
assert(CU_MEMORYTYPE_DEVICE == mem_type);
276-
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
277317

278318
/* This piece of code was added in to handle in a case involving
279319
* OMP threads. The user had initialized CUDA and then spawned
@@ -296,6 +336,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
296336
return OPAL_ERROR;
297337
}
298338
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
339+
if (is_vmm) {
340+
/* This function is expected to set the context if the pointer is device
341+
* accessible, but VMM allocations have a NULL context associated with them,
342+
* which cannot be set on the calling thread */
343+
opal_output(0,
344+
"CUDA: unable to set context with the given pointer"
345+
"ptr=%p aborting...", addr);
346+
return OPAL_ERROR;
347+
}
348+
299349
result = cuCtxSetCurrent(mem_ctx);
300350
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
301351
opal_output(0,

0 commit comments

Comments
 (0)