Skip to content

Commit 93fd3cf

Browse files
committed
cythonize _memory.py
1 parent 33a1110 commit 93fd3cf

File tree

1 file changed

+50
-60
lines changed

1 file changed

+50
-60
lines changed

cuda_core/cuda/core/experimental/_memory.py renamed to cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 50 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44

55
from __future__ import annotations
66

7+
from cuda.core.experimental._utils.cuda_utils cimport (
8+
_check_driver_error as raise_if_driver_error,
9+
)
10+
711
import abc
8-
import weakref
912
from typing import Tuple, TypeVar, Union
1013

1114
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
@@ -23,7 +26,7 @@
2326
"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`."""
2427

2528

26-
class Buffer:
29+
cdef class Buffer:
2730
"""Represent a handle to allocated memory.
2831
2932
This generic object provides a unified representation for how
@@ -33,34 +36,26 @@ class Buffer:
3336
Support for data interchange mechanisms is provided by DLPack.
3437
"""
3538

36-
class _MembersNeededForFinalize:
37-
__slots__ = ("ptr", "size", "mr")
38-
39-
def __init__(self, buffer_obj, ptr, size, mr):
40-
self.ptr = ptr
41-
self.size = size
42-
self.mr = mr
43-
weakref.finalize(buffer_obj, self.close)
44-
45-
def close(self, stream=None):
46-
if self.ptr and self.mr is not None:
47-
self.mr.deallocate(self.ptr, self.size, stream)
48-
self.ptr = 0
49-
self.mr = None
50-
51-
# TODO: handle ownership? (_mr could be None)
52-
__slots__ = ("__weakref__", "_mnff")
39+
cdef:
40+
object _ptr
41+
size_t _size
42+
object _mr
5343

54-
def __new__(self, *args, **kwargs):
44+
def __init__(self, *args, **kwargs):
5545
raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.")
5646

5747
@classmethod
58-
def _init(cls, ptr: DevicePointerT, size: int, mr: MemoryResource | None = None):
59-
self = super().__new__(cls)
60-
self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr)
48+
def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None):
49+
cdef Buffer self = Buffer.__new__(cls)
50+
self._ptr = ptr
51+
self._size = size
52+
self._mr = mr
6153
return self
6254

63-
def close(self, stream: Stream = None):
55+
def __del__(self):
56+
self.close()
57+
58+
cpdef close(self, stream: Stream = None):
6459
"""Deallocate this buffer asynchronously on the given stream.
6560
6661
This buffer is released back to its memory resource
@@ -72,7 +67,10 @@ def close(self, stream: Stream = None):
7267
The stream object to use for asynchronous deallocation. If None,
7368
the behavior depends on the underlying memory resource.
7469
"""
75-
self._mnff.close(stream)
70+
if self._ptr and self._mr is not None:
71+
self._mr.deallocate(self._ptr, self._size, stream)
72+
self._ptr = 0
73+
self._mr = None
7674

7775
@property
7876
def handle(self) -> DevicePointerT:
@@ -83,37 +81,37 @@ def handle(self) -> DevicePointerT:
8381
This handle is a Python object. To get the memory address of the underlying C
8482
handle, call ``int(Buffer.handle)``.
8583
"""
86-
return self._mnff.ptr
84+
return self._ptr
8785

8886
@property
8987
def size(self) -> int:
9088
"""Return the memory size of this buffer."""
91-
return self._mnff.size
89+
return self._size
9290

9391
@property
9492
def memory_resource(self) -> MemoryResource:
9593
"""Return the memory resource associated with this buffer."""
96-
return self._mnff.mr
94+
return self._mr
9795

9896
@property
9997
def is_device_accessible(self) -> bool:
10098
"""Return True if this buffer can be accessed by the GPU, otherwise False."""
101-
if self._mnff.mr is not None:
102-
return self._mnff.mr.is_device_accessible
99+
if self._mr is not None:
100+
return self._mr.is_device_accessible
103101
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
104102

105103
@property
106104
def is_host_accessible(self) -> bool:
107105
"""Return True if this buffer can be accessed by the CPU, otherwise False."""
108-
if self._mnff.mr is not None:
109-
return self._mnff.mr.is_host_accessible
106+
if self._mr is not None:
107+
return self._mr.is_host_accessible
110108
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
111109

112110
@property
113111
def device_id(self) -> int:
114112
"""Return the device ordinal of this buffer."""
115-
if self._mnff.mr is not None:
116-
return self._mnff.mr.device_id
113+
if self._mr is not None:
114+
return self._mr.device_id
117115
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
118116

119117
def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
@@ -134,15 +132,20 @@ def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
134132
"""
135133
if stream is None:
136134
raise ValueError("stream must be provided")
135+
136+
cdef size_t src_size = self._size
137+
137138
if dst is None:
138-
if self._mnff.mr is None:
139+
if self._mr is None:
139140
raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
140-
dst = self._mnff.mr.allocate(self._mnff.size, stream)
141-
if dst._mnff.size != self._mnff.size:
141+
dst = self._mr.allocate(src_size, stream)
142+
143+
cdef size_t dst_size = dst._size
144+
if dst_size != src_size:
142145
raise ValueError(
143-
f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
146+
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
144147
)
145-
handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
148+
handle_return(driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle))
146149
return dst
147150

148151
def copy_from(self, src: Buffer, *, stream: Stream):
@@ -159,11 +162,15 @@ def copy_from(self, src: Buffer, *, stream: Stream):
159162
"""
160163
if stream is None:
161164
raise ValueError("stream must be provided")
162-
if src._mnff.size != self._mnff.size:
165+
166+
cdef size_t dst_size = self._size
167+
cdef size_t src_size = src._size
168+
169+
if src_size != dst_size:
163170
raise ValueError(
164-
f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
171+
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
165172
)
166-
handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
173+
handle_return(driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle))
167174

168175
def __dlpack__(
169176
self,
@@ -211,7 +218,7 @@ def __release_buffer__(self, buffer: memoryview, /):
211218
raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")
212219

213220
@staticmethod
214-
def from_handle(ptr: DevicePointerT, size: int, mr: MemoryResource | None = None) -> Buffer:
221+
def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer:
215222
"""Create a new :class:`Buffer` object from a pointer.
216223

217224
Parameters
@@ -326,23 +333,6 @@ def __init__(self, device_id: int):
326333
self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
327334
self._dev_id = device_id
328335

329-
# Set a higher release threshold to improve performance when there are no active allocations.
330-
# By default, the release threshold is 0, which means memory is immediately released back
331-
# to the OS when there are no active suballocations, causing performance issues.
332-
# Check current release threshold
333-
current_threshold = handle_return(
334-
driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
335-
)
336-
# If threshold is 0 (default), set it to maximum to retain memory in the pool
337-
if int(current_threshold) == 0:
338-
handle_return(
339-
driver.cuMemPoolSetAttribute(
340-
self._handle,
341-
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
342-
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
343-
)
344-
)
345-
346336
def allocate(self, size: int, stream: Stream = None) -> Buffer:
347337
"""Allocate a buffer of the requested size.
348338

0 commit comments

Comments
 (0)