@@ -4,8 +4,11 @@

 from __future__ import annotations

+from cuda.core.experimental._utils.cuda_utils cimport (
+    _check_driver_error as raise_if_driver_error,
+)
+
 import abc
-import weakref
 from typing import Tuple, TypeVar, Union

 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
@@ -23,7 +26,7 @@
 """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`."""


-class Buffer:
+cdef class Buffer:
     """Represent a handle to allocated memory.

     This generic object provides a unified representation for how
@@ -33,34 +36,26 @@ class Buffer:
     Support for data interchange mechanisms are provided by DLPack.
     """

-    class _MembersNeededForFinalize:
-        __slots__ = ("ptr", "size", "mr")
-
-        def __init__(self, buffer_obj, ptr, size, mr):
-            self.ptr = ptr
-            self.size = size
-            self.mr = mr
-            weakref.finalize(buffer_obj, self.close)
-
-        def close(self, stream=None):
-            if self.ptr and self.mr is not None:
-                self.mr.deallocate(self.ptr, self.size, stream)
-            self.ptr = 0
-            self.mr = None
-
-    # TODO: handle ownership? (_mr could be None)
-    __slots__ = ("__weakref__", "_mnff")
+    cdef:
+        object _ptr
+        size_t _size
+        object _mr

-    def __new__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs):
         raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.")

     @classmethod
-    def _init(cls, ptr: DevicePointerT, size: int, mr: MemoryResource | None = None):
-        self = super().__new__(cls)
-        self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr)
+    def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None):
+        cdef Buffer self = Buffer.__new__(cls)
+        self._ptr = ptr
+        self._size = size
+        self._mr = mr
         return self

-    def close(self, stream: Stream = None):
+    def __del__(self):
+        self.close()
+
+    cpdef close(self, stream: Stream = None):
         """Deallocate this buffer asynchronously on the given stream.

         This buffer is released back to their memory resource
@@ -72,7 +67,10 @@ def close(self, stream: Stream = None):
             The stream object to use for asynchronous deallocation. If None,
             the behavior depends on the underlying memory resource.
         """
-        self._mnff.close(stream)
+        if self._ptr and self._mr is not None:
+            self._mr.deallocate(self._ptr, self._size, stream)
+        self._ptr = 0
+        self._mr = None

     @property
     def handle(self) -> DevicePointerT:
@@ -83,37 +81,37 @@ def handle(self) -> DevicePointerT:
         This handle is a Python object. To get the memory address of the underlying C
         handle, call ``int(Buffer.handle)``.
         """
-        return self._mnff.ptr
+        return self._ptr

     @property
     def size(self) -> int:
         """Return the memory size of this buffer."""
-        return self._mnff.size
+        return self._size

     @property
     def memory_resource(self) -> MemoryResource:
         """Return the memory resource associated with this buffer."""
-        return self._mnff.mr
+        return self._mr

     @property
     def is_device_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the GPU, otherwise False."""
-        if self._mnff.mr is not None:
-            return self._mnff.mr.is_device_accessible
+        if self._mr is not None:
+            return self._mr.is_device_accessible
         raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

     @property
     def is_host_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the CPU, otherwise False."""
-        if self._mnff.mr is not None:
-            return self._mnff.mr.is_host_accessible
+        if self._mr is not None:
+            return self._mr.is_host_accessible
         raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

     @property
     def device_id(self) -> int:
         """Return the device ordinal of this buffer."""
-        if self._mnff.mr is not None:
-            return self._mnff.mr.device_id
+        if self._mr is not None:
+            return self._mr.device_id
         raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

     def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
@@ -134,15 +132,20 @@ def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
         """
         if stream is None:
             raise ValueError("stream must be provided")
+
+        cdef size_t src_size = self._size
+
         if dst is None:
-            if self._mnff.mr is None:
+            if self._mr is None:
                 raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
-            dst = self._mnff.mr.allocate(self._mnff.size, stream)
-        if dst._mnff.size != self._mnff.size:
+            dst = self._mr.allocate(src_size, stream)
+
+        cdef size_t dst_size = dst._size
+        if dst_size != src_size:
             raise ValueError(
-                f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
+                f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
             )
-        handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
+        handle_return(driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle))
         return dst

     def copy_from(self, src: Buffer, *, stream: Stream):
@@ -159,11 +162,15 @@ def copy_from(self, src: Buffer, *, stream: Stream):
         """
         if stream is None:
             raise ValueError("stream must be provided")
-        if src._mnff.size != self._mnff.size:
+
+        cdef size_t dst_size = self._size
+        cdef size_t src_size = src._size
+
+        if src_size != dst_size:
             raise ValueError(
-                f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
+                f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
             )
-        handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
+        handle_return(driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle))

     def __dlpack__(
         self,
@@ -211,7 +218,7 @@ def __release_buffer__(self, buffer: memoryview, /):
         raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")

     @staticmethod
-    def from_handle(ptr: DevicePointerT, size: int, mr: MemoryResource | None = None) -> Buffer:
+    def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer:
         """Create a new :class:`Buffer` object from a pointer.

         Parameters
@@ -326,23 +333,6 @@ def __init__(self, device_id: int):
         self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
         self._dev_id = device_id

-        # Set a higher release threshold to improve performance when there are no active allocations.
-        # By default, the release threshold is 0, which means memory is immediately released back
-        # to the OS when there are no active suballocations, causing performance issues.
-        # Check current release threshold
-        current_threshold = handle_return(
-            driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
-        )
-        # If threshold is 0 (default), set it to maximum to retain memory in the pool
-        if int(current_threshold) == 0:
-            handle_return(
-                driver.cuMemPoolSetAttribute(
-                    self._handle,
-                    driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                    driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
-                )
-            )
-
     def allocate(self, size: int, stream: Stream = None) -> Buffer:
         """Allocate a buffer of the requested size.

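
For readers following the change: the public Buffer behavior is unchanged by this diff. Buffers are still handed out by a MemoryResource, copies are stream-ordered via cuMemcpyAsync, and close() releases the allocation back to its resource; only the internal storage moved to cdef fields with __del__ replacing the weakref finalizer. A minimal usage sketch follows; the Device and Stream helper calls (Device, set_current, create_stream, allocate, sync) are assumptions about the surrounding cuda.core.experimental API, not something this diff touches.

# Hedged usage sketch -- only Buffer.copy_to/copy_from/close behavior comes from this diff;
# the Device/Stream helpers are assumed to exist in cuda.core.experimental.
from cuda.core.experimental import Device

dev = Device()
dev.set_current()
stream = dev.create_stream()

src = dev.allocate(1024, stream=stream)   # Buffer backed by the device's memory resource
dst = src.copy_to(stream=stream)          # allocates a same-sized Buffer and enqueues cuMemcpyAsync
dst.copy_from(src, stream=stream)         # reverse direction; mismatched sizes raise ValueError

src.close(stream)                         # stream-ordered deallocation back to the memory resource
dst.close(stream)
stream.sync()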