4
4
5
5
from __future__ import annotations
6
6
7
+ from libc.stdint cimport uintptr_t
8
+
9
+ from cuda.core.experimental._utils.cuda_utils cimport (
10
+ _check_driver_error as raise_if_driver_error,
11
+ )
12
+
7
13
import abc
8
- import weakref
9
14
from typing import Tuple, TypeVar, Union
10
15
11
16
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
12
17
from cuda.core.experimental._stream import Stream, default_stream
13
- from cuda .core .experimental ._utils .cuda_utils import driver , handle_return
18
+ from cuda.core.experimental._utils.cuda_utils import driver
14
19
15
20
# TODO: define a memory property mixin class and make Buffer and
16
21
# MemoryResource both inherit from it
23
28
""" A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`."""
24
29
25
30
26
- class Buffer :
31
+ cdef class Buffer:
27
32
""" Represent a handle to allocated memory.
28
33
29
34
This generic object provides a unified representation for how
@@ -33,34 +38,28 @@ class Buffer:
33
38
Support for data interchange mechanisms is provided by DLPack.
34
39
"""
35
40
36
- class _MembersNeededForFinalize :
37
- __slots__ = ("ptr" , "size" , "mr" )
38
-
39
- def __init__ (self , buffer_obj , ptr , size , mr ):
40
- self .ptr = ptr
41
- self .size = size
42
- self .mr = mr
43
- weakref .finalize (buffer_obj , self .close )
44
-
45
- def close (self , stream = None ):
46
- if self .ptr and self .mr is not None :
47
- self .mr .deallocate (self .ptr , self .size , stream )
48
- self .ptr = 0
49
- self .mr = None
41
+ cdef:
42
+ uintptr_t _ptr
43
+ size_t _size
44
+ object _mr
45
+ object _ptr_obj
50
46
51
- # TODO: handle ownership? (_mr could be None)
52
- __slots__ = ("__weakref__" , "_mnff" )
53
-
54
- def __new__ (self , * args , ** kwargs ):
47
+ def __init__ (self , *args , **kwargs ):
55
48
raise RuntimeError (" Buffer objects cannot be instantiated directly. Please use MemoryResource APIs." )
56
49
57
50
@classmethod
58
- def _init (cls , ptr : DevicePointerT , size : int , mr : MemoryResource | None = None ):
59
- self = super ().__new__ (cls )
60
- self ._mnff = Buffer ._MembersNeededForFinalize (self , ptr , size , mr )
51
+ def _init (cls , ptr: DevicePointerT , size_t size , mr: MemoryResource | None = None ):
52
+ cdef Buffer self = Buffer.__new__ (cls )
53
+ self ._ptr = < uintptr_t> (int (ptr))
54
+ self ._ptr_obj = ptr
55
+ self ._size = size
56
+ self ._mr = mr
61
57
return self
62
58
63
- def close (self , stream : Stream = None ):
59
+ def __del__ (self ):
60
+ self .close()
61
+
62
+ cpdef close(self , stream: Stream = None ):
64
63
""" Deallocate this buffer asynchronously on the given stream.
65
64
66
65
This buffer is released back to its memory resource
@@ -72,7 +71,11 @@ def close(self, stream: Stream = None):
72
71
The stream object to use for asynchronous deallocation. If None,
73
72
the behavior depends on the underlying memory resource.
74
73
"""
75
- self ._mnff .close (stream )
74
+ if self ._ptr and self ._mr is not None :
75
+ self ._mr.deallocate(self ._ptr, self ._size, stream)
76
+ self ._ptr = 0
77
+ self ._mr = None
78
+ self ._ptr_obj = None
76
79
77
80
@property
78
81
def handle (self ) -> DevicePointerT:
@@ -83,37 +86,37 @@ def handle(self) -> DevicePointerT:
83
86
This handle is a Python object. To get the memory address of the underlying C
84
87
handle, call ``int(Buffer.handle)``.
85
88
"""
86
- return self ._mnff . ptr
89
+ return self._ptr_obj
87
90
88
91
@property
89
92
def size(self ) -> int:
90
93
"""Return the memory size of this buffer."""
91
- return self ._mnff . size
94
+ return self._size
92
95
93
96
@property
94
97
def memory_resource(self ) -> MemoryResource:
95
98
"""Return the memory resource associated with this buffer."""
96
- return self ._mnff . mr
99
+ return self._mr
97
100
98
101
@property
99
102
def is_device_accessible(self ) -> bool:
100
103
"""Return True if this buffer can be accessed by the GPU , otherwise False."""
101
- if self ._mnff . mr is not None :
102
- return self ._mnff . mr .is_device_accessible
104
+ if self._mr is not None:
105
+ return self._mr .is_device_accessible
103
106
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
104
107
105
108
@property
106
109
def is_host_accessible(self ) -> bool:
107
110
"""Return True if this buffer can be accessed by the CPU , otherwise False."""
108
- if self ._mnff . mr is not None :
109
- return self ._mnff . mr .is_host_accessible
111
+ if self._mr is not None:
112
+ return self._mr .is_host_accessible
110
113
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
111
114
112
115
@property
113
116
def device_id(self ) -> int:
114
117
"""Return the device ordinal of this buffer."""
115
- if self ._mnff . mr is not None :
116
- return self ._mnff . mr .device_id
118
+ if self._mr is not None:
119
+ return self._mr .device_id
117
120
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
118
121
119
122
def copy_to(self , dst: Buffer = None , *, stream: Stream ) -> Buffer:
@@ -134,15 +137,21 @@ def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
134
137
"""
135
138
if stream is None:
136
139
raise ValueError("stream must be provided")
140
+
141
+ cdef size_t src_size = self ._size
142
+
137
143
if dst is None:
138
- if self ._mnff . mr is None :
144
+ if self._mr is None:
139
145
raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource )")
140
- dst = self ._mnff .mr .allocate (self ._mnff .size , stream )
141
- if dst ._mnff .size != self ._mnff .size :
146
+ dst = self ._mr.allocate(src_size, stream)
147
+
148
+ cdef size_t dst_size = dst._size
149
+ if dst_size != src_size:
142
150
raise ValueError(
143
- f"buffer sizes mismatch between src and dst (sizes are: src={ self . _mnff . size } , dst={ dst . _mnff . size } )"
151
+ f"buffer sizes mismatch between src and dst (sizes are: src = {src_size }, dst = {dst_size })"
144
152
)
145
- handle_return (driver .cuMemcpyAsync (dst ._mnff .ptr , self ._mnff .ptr , self ._mnff .size , stream .handle ))
153
+ err , = driver.cuMemcpyAsync(dst._ptr , self._ptr , src_size , stream.handle )
154
+ raise_if_driver_error(err )
146
155
return dst
147
156
148
157
def copy_from(self , src: Buffer , *, stream: Stream ):
@@ -159,11 +168,16 @@ def copy_from(self, src: Buffer, *, stream: Stream):
159
168
"""
160
169
if stream is None :
161
170
raise ValueError (" stream must be provided" )
162
- if src ._mnff .size != self ._mnff .size :
171
+
172
+ cdef size_t dst_size = self ._size
173
+ cdef size_t src_size = src._size
174
+
175
+ if src_size != dst_size:
163
176
raise ValueError (
164
- f"buffer sizes mismatch between src and dst (sizes are: src={ src . _mnff . size } , dst={ self . _mnff . size } )"
177
+ f" buffer sizes mismatch between src and dst (sizes are: src={src_size }, dst={dst_size })"
165
178
)
166
- handle_return (driver .cuMemcpyAsync (self ._mnff .ptr , src ._mnff .ptr , self ._mnff .size , stream .handle ))
179
+ err, = driver.cuMemcpyAsync(self ._ptr, src._ptr, dst_size, stream.handle)
180
+ raise_if_driver_error(err)
167
181
168
182
def __dlpack__ (
169
183
self ,
@@ -189,13 +203,14 @@ def __dlpack__(
189
203
return capsule
190
204
191
205
def __dlpack_device__(self ) -> Tuple[int , int]:
192
- d_h = (bool (self .is_device_accessible ), bool (self .is_host_accessible ))
193
- if d_h == (True , False ):
206
+ cdef bint d = self .is_device_accessible
207
+ cdef bint h = self .is_host_accessible
208
+ if d and (not h ):
194
209
return (DLDeviceType.kDLCUDA, self .device_id)
195
- if d_h == ( True , True ) :
210
+ if d and h :
196
211
# TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
197
212
return (DLDeviceType.kDLCUDAHost, 0 )
198
- if d_h == ( False , True ) :
213
+ if ( not d) and h :
199
214
return (DLDeviceType.kDLCPU, 0 )
200
215
raise BufferError(" buffer is neither device-accessible nor host-accessible" )
201
216
@@ -211,7 +226,7 @@ def __release_buffer__(self, buffer: memoryview, /):
211
226
raise NotImplementedError (" WIP: Buffer.__release_buffer__ hasn't been implemented yet." )
212
227
213
228
@staticmethod
214
- def from_handle (ptr : DevicePointerT , size : int , mr : MemoryResource | None = None ) -> Buffer :
229
+ def from_handle (ptr: DevicePointerT , size_t size , mr: MemoryResource | None = None ) -> Buffer:
215
230
"""Create a new :class:`Buffer` object from a pointer.
216
231
217
232
Parameters
@@ -247,7 +262,7 @@ def __init__(self, *args, **kwargs):
247
262
...
248
263
249
264
@abc.abstractmethod
250
- def allocate (self , size : int , stream : Stream = None ) -> Buffer :
265
+ def allocate (self , size_t size , stream: Stream = None ) -> Buffer:
251
266
"""Allocate a buffer of the requested size.
252
267
253
268
Parameters
@@ -268,7 +283,7 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
268
283
...
269
284
270
285
@abc.abstractmethod
271
- def deallocate (self , ptr : DevicePointerT , size : int , stream : Stream = None ):
286
+ def deallocate(self , ptr: DevicePointerT , size_t size , stream: Stream = None ):
272
287
""" Deallocate a buffer previously allocated by this resource.
273
288
274
289
Parameters
@@ -323,27 +338,28 @@ class DeviceMemoryResource(MemoryResource):
323
338
__slots__ = (" _dev_id" ,)
324
339
325
340
def __init__ (self , device_id: int ):
326
- self ._handle = handle_return (driver .cuDeviceGetMemPool (device_id ))
341
+ err, self ._handle = driver.cuDeviceGetMemPool(device_id)
342
+ raise_if_driver_error(err)
327
343
self ._dev_id = device_id
328
344
329
345
# Set a higher release threshold to improve performance when there are no active allocations.
330
346
# By default, the release threshold is 0, which means memory is immediately released back
331
347
# to the OS when there are no active suballocations, causing performance issues.
332
348
# Check current release threshold
333
- current_threshold = handle_return (
334
- driver . cuMemPoolGetAttribute ( self ._handle , driver .CUmemPool_attribute .CU_MEMPOOL_ATTR_RELEASE_THRESHOLD )
349
+ err, current_threshold = driver.cuMemPoolGetAttribute (
350
+ self ._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
335
351
)
352
+ raise_if_driver_error(err)
336
353
# If threshold is 0 (default), set it to maximum to retain memory in the pool
337
354
if int (current_threshold) == 0 :
338
- handle_return (
339
- driver .cuMemPoolSetAttribute (
340
- self ._handle ,
341
- driver .CUmemPool_attribute .CU_MEMPOOL_ATTR_RELEASE_THRESHOLD ,
342
- driver .cuuint64_t (0xFFFFFFFFFFFFFFFF ),
343
- )
355
+ err, = driver.cuMemPoolSetAttribute(
356
+ self ._handle,
357
+ driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
358
+ driver.cuuint64_t(0xFFFFFFFFFFFFFFFF ),
344
359
)
360
+ raise_if_driver_error(err)
345
361
346
- def allocate (self , size : int , stream : Stream = None ) -> Buffer :
362
+ def allocate (self , size_t size , stream: Stream = None ) -> Buffer:
347
363
"""Allocate a buffer of the requested size.
348
364
349
365
Parameters
@@ -362,10 +378,11 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
362
378
"""
363
379
if stream is None:
364
380
stream = default_stream()
365
- ptr = handle_return (driver .cuMemAllocFromPoolAsync (size , self ._handle , stream .handle ))
381
+ err , ptr = driver.cuMemAllocFromPoolAsync(size, self ._handle, stream.handle)
382
+ raise_if_driver_error(err )
366
383
return Buffer._init(ptr , size , self )
367
384
368
- def deallocate (self , ptr : DevicePointerT , size : int , stream : Stream = None ):
385
+ def deallocate(self , ptr: DevicePointerT , size_t size , stream: Stream = None ):
369
386
""" Deallocate a buffer previously allocated by this resource.
370
387
371
388
Parameters
@@ -380,7 +397,8 @@ def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
380
397
"""
381
398
if stream is None :
382
399
stream = default_stream()
383
- handle_return (driver .cuMemFreeAsync (ptr , stream .handle ))
400
+ err, = driver.cuMemFreeAsync(ptr, stream.handle)
401
+ raise_if_driver_error(err)
384
402
385
403
@property
386
404
def is_device_accessible (self ) -> bool:
@@ -407,7 +425,7 @@ def __init__(self):
407
425
# TODO: support flags from cuMemHostAlloc?
408
426
self ._handle = None
409
427
410
- def allocate (self , size : int , stream : Stream = None ) -> Buffer :
428
+ def allocate (self , size_t size , stream: Stream = None ) -> Buffer:
411
429
"""Allocate a buffer of the requested size.
412
430
413
431
Parameters
@@ -422,10 +440,11 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
422
440
Buffer
423
441
The allocated buffer object, which is accessible on both host and device.
424
442
"""
425
- ptr = handle_return (driver .cuMemAllocHost (size ))
443
+ err , ptr = driver.cuMemAllocHost(size)
444
+ raise_if_driver_error(err )
426
445
return Buffer._init(ptr , size , self )
427
446
428
- def deallocate (self , ptr : DevicePointerT , size : int , stream : Stream = None ):
447
+ def deallocate(self , ptr: DevicePointerT , size_t size , stream: Stream = None ):
429
448
""" Deallocate a buffer previously allocated by this resource.
430
449
431
450
Parameters
@@ -440,7 +459,8 @@ def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
440
459
"""
441
460
if stream:
442
461
stream.sync()
443
- handle_return (driver .cuMemFreeHost (ptr ))
462
+ err, = driver.cuMemFreeHost(ptr)
463
+ raise_if_driver_error(err)
444
464
445
465
@property
446
466
def is_device_accessible (self ) -> bool:
@@ -466,14 +486,16 @@ def __init__(self, device_id):
466
486
self ._dev_id = device_id
467
487
468
488
def allocate (self , size , stream = None ) -> Buffer:
469
- ptr = handle_return (driver .cuMemAlloc (size ))
489
+ err , ptr = driver.cuMemAlloc(size)
490
+ raise_if_driver_error(err )
470
491
return Buffer._init(ptr , size , self )
471
492
472
493
def deallocate(self , ptr , size , stream = None ):
473
494
if stream is None :
474
495
stream = default_stream()
475
496
stream.sync()
476
- handle_return (driver .cuMemFree (ptr ))
497
+ err, = driver.cuMemFree(ptr)
498
+ raise_if_driver_error(err)
477
499
478
500
@property
479
501
def is_device_accessible (self ) -> bool:
0 commit comments