@@ -154,49 +154,75 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
154
154
return dev_id ;
155
155
}
156
156
157
- static CUmemorytype accelerator_cuda_check_host_numa (CUdeviceptr dbuf )
157
+ static int accelerator_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type ,
158
+ int * dev_id )
158
159
{
159
- #if OPAL_CUDA_HOST_NUMA_SUPPORT
160
+ #if OPAL_CUDA_VMM_SUPPORT
161
+ static int device_count = -1 ;
160
162
CUmemAllocationProp prop ;
161
163
CUmemLocation location ;
162
164
CUresult result ;
163
165
unsigned long long flags ;
164
166
CUmemGenericAllocationHandle alloc_handle ;
165
- /* Check if memory is allocated using VMM API and see if host memory needs
166
- * to be treated as pinned device memory */
167
+
168
+ if (device_count == -1 ) {
169
+ result = cuDeviceGetCount (& device_count );
170
+ if (result != CUDA_SUCCESS ) {
171
+ return 0 ;
172
+ }
173
+ }
174
+
167
175
result = cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
168
176
if (result != CUDA_SUCCESS ) {
169
- return CU_MEMORYTYPE_HOST ;
177
+ return 0 ;
170
178
}
171
179
172
180
result = cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
173
181
if (result != CUDA_SUCCESS ) {
174
- return CU_MEMORYTYPE_HOST ;
175
- }
176
-
177
- if ((CU_MEM_LOCATION_TYPE_HOST == prop .location .type ) ||
178
- (CU_MEM_LOCATION_TYPE_HOST_NUMA == prop .location .type ) ||
179
- (CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == prop .location .type )) {
180
- /* If host has read-write access, then range is accessible by CPU */
181
- result = cuMemGetAccess (& flags , & location , dbuf );
182
- if ((CUDA_SUCCESS == result ) &&
183
- ((CU_MEM_LOCATION_TYPE_HOST == location .type ) ||
184
- (CU_MEM_LOCATION_TYPE_HOST_NUMA == location .type ) ||
185
- (CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT == location .type )) &&
186
- (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
187
- return CU_MEMORYTYPE_HOST ;
188
- } else {
189
- return CU_MEMORYTYPE_DEVICE ;
182
+ cuMemRelease (alloc_handle );
183
+ return 0 ;
184
+ }
185
+
186
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
187
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
188
+ * dev_id = prop .location .id ;
189
+ cuMemRelease (alloc_handle );
190
+ return 1 ;
191
+ }
192
+
193
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
194
+ /* check if device has access */
195
+ for (int i = 0 ; i < device_count ; i ++ ) {
196
+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
197
+ location .id = i ;
198
+ result = cuMemGetAccess (& flags , & location , dbuf );
199
+ if ((CUDA_SUCCESS == result ) &&
200
+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
201
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
202
+ * dev_id = i ;
203
+ cuMemRelease (alloc_handle );
204
+ return 1 ;
205
+ }
190
206
}
191
207
}
192
- #else
193
- return CU_MEMORYTYPE_HOST ;
208
+
209
+ /* host must have access as device access possibility is exhausted */
210
+ * mem_type = CU_MEMORYTYPE_HOST ;
211
+ * dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
212
+ cuMemRelease (alloc_handle );
213
+ return 1 ;
214
+
194
215
#endif
216
+
217
+ return 0 ;
195
218
}
196
219
197
220
static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
198
221
{
199
222
CUresult result ;
223
+ int is_vmm = 0 ;
224
+ int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
225
+ CUmemorytype vmm_mem_type = 0 ;
200
226
CUmemorytype mem_type = 0 ;
201
227
CUdeviceptr dbuf = (CUdeviceptr ) addr ;
202
228
CUcontext ctx = NULL , mem_ctx = NULL ;
@@ -208,6 +234,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
208
234
209
235
* flags = 0 ;
210
236
237
+ is_vmm = accelerator_cuda_check_vmm (dbuf , & vmm_mem_type , & vmm_dev_id );
238
+
211
239
#if OPAL_CUDA_GET_ATTRIBUTES
212
240
uint32_t is_managed = 0 ;
213
241
/* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -237,20 +265,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
237
265
return OPAL_ERROR ;
238
266
}
239
267
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
240
- mem_type = accelerator_cuda_check_host_numa (dbuf );
241
- if (CU_MEMORYTYPE_HOST == mem_type ) {
268
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
269
+ mem_type = CU_MEMORYTYPE_DEVICE ;
270
+ * dev_id = vmm_dev_id ;
271
+ } else {
242
272
/* Host memory, nothing to do here */
243
273
return 0 ;
244
274
}
245
275
} else if (0 == mem_type ) {
246
276
/* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
247
277
return 0 ;
248
278
} else {
249
- /* query the device from the context */
250
- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
279
+ if (is_vmm ) {
280
+ * dev_id = vmm_dev_id ;
281
+ } else {
282
+ /* query the device from the context */
283
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
284
+ }
251
285
}
252
- /* Must be a device pointer */
253
- assert (CU_MEMORYTYPE_DEVICE == mem_type );
254
286
#else /* OPAL_CUDA_GET_ATTRIBUTES */
255
287
result = cuPointerGetAttribute (& mem_type , CU_POINTER_ATTRIBUTE_MEMORY_TYPE , dbuf );
256
288
if (CUDA_SUCCESS != result ) {
@@ -261,19 +293,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
261
293
return OPAL_ERROR ;
262
294
}
263
295
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
264
- mem_type = accelerator_cuda_check_host_numa (dbuf );
265
- if (CU_MEMORYTYPE_HOST == mem_type ) {
296
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
297
+ mem_type = CU_MEMORYTYPE_DEVICE ;
298
+ * dev_id = vmm_dev_id ;
299
+ } else {
266
300
/* Host memory, nothing to do here */
267
301
return 0 ;
268
302
}
269
303
} else {
270
- result = cuPointerGetAttribute (& mem_ctx , CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
271
- /* query the device from the context */
272
- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
304
+ if (is_vmm ) {
305
+ * dev_id = vmm_dev_id ;
306
+ } else {
307
+ result = cuPointerGetAttribute (& mem_ctx ,
308
+ CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
309
+ /* query the device from the context */
310
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
311
+ }
273
312
}
313
+ #endif /* OPAL_CUDA_GET_ATTRIBUTES */
314
+
274
315
/* Must be a device pointer */
275
316
assert (CU_MEMORYTYPE_DEVICE == mem_type );
276
- #endif /* OPAL_CUDA_GET_ATTRIBUTES */
277
317
278
318
/* This piece of code was added in to handle in a case involving
279
319
* OMP threads. The user had initialized CUDA and then spawned
@@ -296,6 +336,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
296
336
return OPAL_ERROR ;
297
337
}
298
338
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
339
+ if (is_vmm ) {
340
+ /* This function is expected to set context if pointer is device
341
+ * accessible but VMM allocations have NULL context associated
342
+ * which cannot be set against the calling thread */
343
+ opal_output (0 ,
344
+ "CUDA: unable to set context with the given pointer "
345
+ "ptr=%p aborting..." , addr );
346
+ return OPAL_ERROR ;
347
+ }
348
+
299
349
result = cuCtxSetCurrent (mem_ctx );
300
350
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
301
351
opal_output (0 ,
0 commit comments