Skip to content

Commit e3a9f22

Browse files
Copilot and leofang authored
Cythonize Buffer and MemoryResource classes for performance optimization (#876)
* cythonize _memory.py
* fix
* reduce overhead
* tripped again: Cython enforces type annotations at compile time
* ensure Buffer.handle is None everywhere
* fix test
* nit: avoid extra tuple
* update tests to comply with spec and make Cython 3.1 happy

---------

Co-authored-by: Leo Fang <[email protected]>
1 parent 459bbcb commit e3a9f22

File tree

5 files changed

+108
-82
lines changed

5 files changed

+108
-82
lines changed

cuda_core/cuda/core/experimental/_memory.py renamed to cuda_core/cuda/core/experimental/_memory.pyx

Lines changed: 91 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44

55
from __future__ import annotations
66

7+
from libc.stdint cimport uintptr_t
8+
9+
from cuda.core.experimental._utils.cuda_utils cimport (
10+
_check_driver_error as raise_if_driver_error,
11+
)
12+
713
import abc
8-
import weakref
914
from typing import Tuple, TypeVar, Union
1015

1116
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
1217
from cuda.core.experimental._stream import Stream, default_stream
13-
from cuda.core.experimental._utils.cuda_utils import driver, handle_return
18+
from cuda.core.experimental._utils.cuda_utils import driver
1419

1520
# TODO: define a memory property mixin class and make Buffer and
1621
# MemoryResource both inherit from it
@@ -23,7 +28,7 @@
2328
"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`."""
2429

2530

26-
class Buffer:
31+
cdef class Buffer:
2732
"""Represent a handle to allocated memory.
2833
2934
This generic object provides a unified representation for how
@@ -33,34 +38,28 @@ class Buffer:
3338
Support for data interchange mechanisms are provided by DLPack.
3439
"""
3540

36-
class _MembersNeededForFinalize:
37-
__slots__ = ("ptr", "size", "mr")
38-
39-
def __init__(self, buffer_obj, ptr, size, mr):
40-
self.ptr = ptr
41-
self.size = size
42-
self.mr = mr
43-
weakref.finalize(buffer_obj, self.close)
44-
45-
def close(self, stream=None):
46-
if self.ptr and self.mr is not None:
47-
self.mr.deallocate(self.ptr, self.size, stream)
48-
self.ptr = 0
49-
self.mr = None
41+
cdef:
42+
uintptr_t _ptr
43+
size_t _size
44+
object _mr
45+
object _ptr_obj
5046

51-
# TODO: handle ownership? (_mr could be None)
52-
__slots__ = ("__weakref__", "_mnff")
53-
54-
def __new__(self, *args, **kwargs):
47+
def __init__(self, *args, **kwargs):
5548
raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.")
5649

5750
@classmethod
58-
def _init(cls, ptr: DevicePointerT, size: int, mr: MemoryResource | None = None):
59-
self = super().__new__(cls)
60-
self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr)
51+
def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None):
52+
cdef Buffer self = Buffer.__new__(cls)
53+
self._ptr = <uintptr_t>(int(ptr))
54+
self._ptr_obj = ptr
55+
self._size = size
56+
self._mr = mr
6157
return self
6258

63-
def close(self, stream: Stream = None):
59+
def __del__(self):
60+
self.close()
61+
62+
cpdef close(self, stream: Stream = None):
6463
"""Deallocate this buffer asynchronously on the given stream.
6564
6665
This buffer is released back to their memory resource
@@ -72,7 +71,11 @@ def close(self, stream: Stream = None):
7271
The stream object to use for asynchronous deallocation. If None,
7372
the behavior depends on the underlying memory resource.
7473
"""
75-
self._mnff.close(stream)
74+
if self._ptr and self._mr is not None:
75+
self._mr.deallocate(self._ptr, self._size, stream)
76+
self._ptr = 0
77+
self._mr = None
78+
self._ptr_obj = None
7679

7780
@property
7881
def handle(self) -> DevicePointerT:
@@ -83,37 +86,37 @@ def handle(self) -> DevicePointerT:
8386
This handle is a Python object. To get the memory address of the underlying C
8487
handle, call ``int(Buffer.handle)``.
8588
"""
86-
return self._mnff.ptr
89+
return self._ptr_obj
8790

8891
@property
8992
def size(self) -> int:
9093
"""Return the memory size of this buffer."""
91-
return self._mnff.size
94+
return self._size
9295

9396
@property
9497
def memory_resource(self) -> MemoryResource:
9598
"""Return the memory resource associated with this buffer."""
96-
return self._mnff.mr
99+
return self._mr
97100

98101
@property
99102
def is_device_accessible(self) -> bool:
100103
"""Return True if this buffer can be accessed by the GPU, otherwise False."""
101-
if self._mnff.mr is not None:
102-
return self._mnff.mr.is_device_accessible
104+
if self._mr is not None:
105+
return self._mr.is_device_accessible
103106
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
104107

105108
@property
106109
def is_host_accessible(self) -> bool:
107110
"""Return True if this buffer can be accessed by the CPU, otherwise False."""
108-
if self._mnff.mr is not None:
109-
return self._mnff.mr.is_host_accessible
111+
if self._mr is not None:
112+
return self._mr.is_host_accessible
110113
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
111114

112115
@property
113116
def device_id(self) -> int:
114117
"""Return the device ordinal of this buffer."""
115-
if self._mnff.mr is not None:
116-
return self._mnff.mr.device_id
118+
if self._mr is not None:
119+
return self._mr.device_id
117120
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
118121

119122
def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
@@ -134,15 +137,21 @@ def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
134137
"""
135138
if stream is None:
136139
raise ValueError("stream must be provided")
140+
141+
cdef size_t src_size = self._size
142+
137143
if dst is None:
138-
if self._mnff.mr is None:
144+
if self._mr is None:
139145
raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
140-
dst = self._mnff.mr.allocate(self._mnff.size, stream)
141-
if dst._mnff.size != self._mnff.size:
146+
dst = self._mr.allocate(src_size, stream)
147+
148+
cdef size_t dst_size = dst._size
149+
if dst_size != src_size:
142150
raise ValueError(
143-
f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
151+
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
144152
)
145-
handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
153+
err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle)
154+
raise_if_driver_error(err)
146155
return dst
147156

148157
def copy_from(self, src: Buffer, *, stream: Stream):
@@ -159,11 +168,16 @@ def copy_from(self, src: Buffer, *, stream: Stream):
159168
"""
160169
if stream is None:
161170
raise ValueError("stream must be provided")
162-
if src._mnff.size != self._mnff.size:
171+
172+
cdef size_t dst_size = self._size
173+
cdef size_t src_size = src._size
174+
175+
if src_size != dst_size:
163176
raise ValueError(
164-
f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
177+
f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})"
165178
)
166-
handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
179+
err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle)
180+
raise_if_driver_error(err)
167181

168182
def __dlpack__(
169183
self,
@@ -189,13 +203,14 @@ def __dlpack__(
189203
return capsule
190204

191205
def __dlpack_device__(self) -> Tuple[int, int]:
192-
d_h = (bool(self.is_device_accessible), bool(self.is_host_accessible))
193-
if d_h == (True, False):
206+
cdef bint d = self.is_device_accessible
207+
cdef bint h = self.is_host_accessible
208+
if d and (not h):
194209
return (DLDeviceType.kDLCUDA, self.device_id)
195-
if d_h == (True, True):
210+
if d and h:
196211
# TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
197212
return (DLDeviceType.kDLCUDAHost, 0)
198-
if d_h == (False, True):
213+
if (not d) and h:
199214
return (DLDeviceType.kDLCPU, 0)
200215
raise BufferError("buffer is neither device-accessible nor host-accessible")
201216

@@ -211,7 +226,7 @@ def __release_buffer__(self, buffer: memoryview, /):
211226
raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")
212227

213228
@staticmethod
214-
def from_handle(ptr: DevicePointerT, size: int, mr: MemoryResource | None = None) -> Buffer:
229+
def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer:
215230
"""Create a new :class:`Buffer` object from a pointer.
216231

217232
Parameters
@@ -247,7 +262,7 @@ def __init__(self, *args, **kwargs):
247262
...
248263

249264
@abc.abstractmethod
250-
def allocate(self, size: int, stream: Stream = None) -> Buffer:
265+
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
251266
"""Allocate a buffer of the requested size.
252267

253268
Parameters
@@ -268,7 +283,7 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
268283
...
269284

270285
@abc.abstractmethod
271-
def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
286+
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
272287
"""Deallocate a buffer previously allocated by this resource.
273288
274289
Parameters
@@ -323,27 +338,28 @@ class DeviceMemoryResource(MemoryResource):
323338
__slots__ = ("_dev_id",)
324339

325340
def __init__(self, device_id: int):
326-
self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
341+
err, self._handle = driver.cuDeviceGetMemPool(device_id)
342+
raise_if_driver_error(err)
327343
self._dev_id = device_id
328344

329345
# Set a higher release threshold to improve performance when there are no active allocations.
330346
# By default, the release threshold is 0, which means memory is immediately released back
331347
# to the OS when there are no active suballocations, causing performance issues.
332348
# Check current release threshold
333-
current_threshold = handle_return(
334-
driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
349+
err, current_threshold = driver.cuMemPoolGetAttribute(
350+
self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
335351
)
352+
raise_if_driver_error(err)
336353
# If threshold is 0 (default), set it to maximum to retain memory in the pool
337354
if int(current_threshold) == 0:
338-
handle_return(
339-
driver.cuMemPoolSetAttribute(
340-
self._handle,
341-
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
342-
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
343-
)
355+
err, = driver.cuMemPoolSetAttribute(
356+
self._handle,
357+
driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
358+
driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
344359
)
360+
raise_if_driver_error(err)
345361

346-
def allocate(self, size: int, stream: Stream = None) -> Buffer:
362+
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
347363
"""Allocate a buffer of the requested size.
348364

349365
Parameters
@@ -362,10 +378,11 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
362378
"""
363379
if stream is None:
364380
stream = default_stream()
365-
ptr = handle_return(driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle))
381+
err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)
382+
raise_if_driver_error(err)
366383
return Buffer._init(ptr, size, self)
367384

368-
def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
385+
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
369386
"""Deallocate a buffer previously allocated by this resource.
370387
371388
Parameters
@@ -380,7 +397,8 @@ def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
380397
"""
381398
if stream is None:
382399
stream = default_stream()
383-
handle_return(driver.cuMemFreeAsync(ptr, stream.handle))
400+
err, = driver.cuMemFreeAsync(ptr, stream.handle)
401+
raise_if_driver_error(err)
384402

385403
@property
386404
def is_device_accessible(self) -> bool:
@@ -407,7 +425,7 @@ def __init__(self):
407425
# TODO: support flags from cuMemHostAlloc?
408426
self._handle = None
409427

410-
def allocate(self, size: int, stream: Stream = None) -> Buffer:
428+
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
411429
"""Allocate a buffer of the requested size.
412430

413431
Parameters
@@ -422,10 +440,11 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
422440
Buffer
423441
The allocated buffer object, which is accessible on both host and device.
424442
"""
425-
ptr = handle_return(driver.cuMemAllocHost(size))
443+
err, ptr = driver.cuMemAllocHost(size)
444+
raise_if_driver_error(err)
426445
return Buffer._init(ptr, size, self)
427446

428-
def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
447+
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
429448
"""Deallocate a buffer previously allocated by this resource.
430449
431450
Parameters
@@ -440,7 +459,8 @@ def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None):
440459
"""
441460
if stream:
442461
stream.sync()
443-
handle_return(driver.cuMemFreeHost(ptr))
462+
err, = driver.cuMemFreeHost(ptr)
463+
raise_if_driver_error(err)
444464

445465
@property
446466
def is_device_accessible(self) -> bool:
@@ -466,14 +486,16 @@ def __init__(self, device_id):
466486
self._dev_id = device_id
467487

468488
def allocate(self, size, stream=None) -> Buffer:
469-
ptr = handle_return(driver.cuMemAlloc(size))
489+
err, ptr = driver.cuMemAlloc(size)
490+
raise_if_driver_error(err)
470491
return Buffer._init(ptr, size, self)
471492

472493
def deallocate(self, ptr, size, stream=None):
473494
if stream is None:
474495
stream = default_stream()
475496
stream.sync()
476-
handle_return(driver.cuMemFree(ptr))
497+
err, = driver.cuMemFree(ptr)
498+
raise_if_driver_error(err)
477499

478500
@property
479501
def is_device_accessible(self) -> bool:

cuda_core/docs/source/release/0.X.Y-notes.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Breaking Changes
1919
----------------
2020

2121
- **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``.
22+
- When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to `None`. It was previously set to ``0`` by accident.
2223

2324

2425
New features
@@ -40,4 +41,5 @@ Fixes and enhancements
4041
- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations by setting a higher release threshold (addresses issue #771).
4142
- Improved :class:`StridedMemoryView` creation time performance by optimizing shape and strides tuple creation using Python/C API (addresses issue #449).
4243
- Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867).
43-
- Fixed a bug in :class:`GraphBuilder.add_child` where dependencies extracted from capturing stream were passed inconsistently with num_dependencies parameter (addresses issue #843).
44+
- Fixed a bug in :class:`GraphBuilder.add_child` where dependencies extracted from capturing stream were passed inconsistently with num_dependencies parameter (addresses issue #843).
45+
- Make :class:`Buffer` creation more performant.

cuda_core/examples/memory_ops.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@
129129
cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream
130130

131131
# Verify buffers are properly closed
132-
assert device_buffer.handle == 0, "Device buffer should be closed"
133-
assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
134-
assert new_device_buffer.handle == 0, "New device buffer should be closed"
132+
assert device_buffer.handle is None, "Device buffer should be closed"
133+
assert pinned_buffer.handle is None, "Pinned buffer should be closed"
134+
assert new_device_buffer.handle is None, "New device buffer should be closed"
135135

136136
print("Memory management example completed!")

0 commit comments

Comments
 (0)