From 8c685beae784fa7803a5a8c646d9dc6be3892187 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 18 Aug 2025 09:55:05 -0400 Subject: [PATCH 1/8] Fix #449: Delay construction of Python attributes --- .../cuda/core/experimental/_memoryview.pyx | 125 +++++++++++------- cuda_core/tests/test_utils.py | 32 +++++ 2 files changed, 110 insertions(+), 47 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 31482229c..418967fa5 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -18,7 +18,6 @@ from cuda.core.experimental._utils cimport cuda_utils # TODO(leofang): support NumPy structured dtypes -@cython.dataclasses.dataclass cdef class StridedMemoryView: """A dataclass holding metadata of a strided dense array/tensor. @@ -51,7 +50,7 @@ cdef class StridedMemoryView: Pointer to the tensor buffer (as a Python `int`). shape : tuple Shape of the tensor. - strides : tuple + strides : Optional[tuple] Strides of the tensor (in **counts**, not bytes). dtype: numpy.dtype Data type of the tensor. @@ -70,19 +69,22 @@ cdef class StridedMemoryView: obj : Any Any objects that supports either DLPack (up to v1.0) or CUDA Array Interface (v3). - stream_ptr: int + stream_ptr: Optional[int] The pointer address (as Python `int`) to the **consumer** stream. Stream ordering will be properly established unless ``-1`` is passed. """ - # TODO: switch to use Cython's cdef typing? - ptr: int = None - shape: tuple = None - strides: tuple = None # in counts, not bytes - dtype: numpy.dtype = None - device_id: int = None # -1 for CPU - is_device_accessible: bool = None - readonly: bool = None - exporting_obj: Any = None + cdef readonly: + intptr_t ptr + int device_id + bint is_device_accessible + bint readonly + object exporting_obj + + # The tensor object if has obj has __dlpack__, otherwise must be NULL + cdef DLTensor *dl_tensor + # A strong reference to the result of obj.__dlpack__() so we + # can lazily create shape and strides from it later + cdef object dlpack_capsule def __init__(self, obj=None, stream_ptr=None): if obj is not None: @@ -92,9 +94,50 @@ cdef class StridedMemoryView: else: view_as_cai(obj, stream_ptr, self) else: - # default construct pass + @property + def shape(self) -> tuple[int]: + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + return cuda_utils.carray_int64_t_to_tuple( + self.dl_tensor.shape, + self.dl_tensor.ndim + ) + else: + return self.exporting_obj.__cuda_array_interface__["shape"] + return () + + @property + def strides(self) -> Optional[tuple[int]]: + cdef int itemsize + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + if self.dl_tensor.strides: + return cuda_utils.carray_int64_t_to_tuple( + self.dl_tensor.strides, + self.dl_tensor.ndim + ) + else: + strides = self.exporting_obj.__cuda_array_interface__.get("strides") + if strides is not None: + itemsize = self.dtype.itemsize + result = cpython.PyTuple_New(len(strides)) + for i in range(len(strides)): + cpython.PyTuple_SET_ITEM(result, i, strides[i] // itemsize) + return result + return None + + @property + def dtype(self) -> Optional[numpy.dtype]: + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + return dtype_dlpack_to_numpy(&self.dl_tensor.dtype) + else: + # TODO: this only works for built-in numeric types + return numpy.dtype(self.exporting_obj.__cuda_array_interface__["typestr"]) + return None + def __repr__(self): return 
(f"StridedMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" @@ -152,7 +195,7 @@ cdef class _StridedMemoryViewProxy: cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i - cdef bint is_device_accessible, versioned, is_readonly + cdef bint is_device_accessible, is_readonly is_device_accessible = False dldevice, device_id = obj.__dlpack_device__() if dldevice == _kDLCPU: @@ -193,7 +236,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): data = cpython.PyCapsule_GetPointer( capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) - versioned = True dlm_tensor_ver = data dl_tensor = &dlm_tensor_ver.dl_tensor is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) @@ -202,7 +244,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): capsule, DLPACK_TENSOR_UNUSED_NAME): data = cpython.PyCapsule_GetPointer( capsule, DLPACK_TENSOR_UNUSED_NAME) - versioned = False dlm_tensor = data dl_tensor = &dlm_tensor.dl_tensor is_readonly = False @@ -210,24 +251,17 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): else: assert False + cpython.PyCapsule_SetName(capsule, used_name) + cdef StridedMemoryView buf = StridedMemoryView() if view is None else view + buf.dl_tensor = dl_tensor + buf.dlpack_capsule = capsule buf.ptr = (dl_tensor.data) - - buf.shape = cuda_utils.carray_int64_t_to_tuple(dl_tensor.shape, dl_tensor.ndim) - if dl_tensor.strides: - buf.strides = cuda_utils.carray_int64_t_to_tuple(dl_tensor.strides, dl_tensor.ndim) - else: - # C-order - buf.strides = None - - buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) buf.device_id = device_id buf.is_device_accessible = is_device_accessible buf.readonly = is_readonly buf.exporting_obj = obj - cpython.PyCapsule_SetName(capsule, used_name) - return buf @@ -291,7 +325,8 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype): return numpy.dtype(np_dtype) -cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): +# Also generate for Python so we can test this code path +cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): cdef dict cai_data = obj.__cuda_array_interface__ if cai_data["version"] < 3: raise BufferError("only CUDA Array Interface v3 or above is supported") @@ -302,14 +337,8 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.exporting_obj = obj + buf.dl_tensor = NULL buf.ptr, buf.readonly = cai_data["data"] - buf.shape = cai_data["shape"] - # TODO: this only works for built-in numeric types - buf.dtype = numpy.dtype(cai_data["typestr"]) - buf.strides = cai_data.get("strides") - if buf.strides is not None: - # convert to counts - buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides) buf.is_device_accessible = True buf.device_id = handle_return( driver.cuPointerGetAttribute( @@ -317,18 +346,20 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.ptr)) cdef intptr_t producer_s, consumer_s - stream = cai_data.get("stream") - if stream is not None: - producer_s = (stream) - consumer_s = (stream_ptr) - assert producer_s > 0 - # establish stream order - if producer_s != consumer_s: - e = handle_return(driver.cuEventCreate( - driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(e, producer_s)) - handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) - handle_return(driver.cuEventDestroy(e)) + stream_ptr = 
int(stream_ptr) if stream_ptr is not None else -1 + if stream_ptr != -1: + stream = cai_data.get("stream") + if stream is not None: + producer_s = (stream) + consumer_s = (stream_ptr) + assert producer_s > 0 + # establish stream order + if producer_s != consumer_s: + e = handle_return(driver.cuEventCreate( + driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + handle_return(driver.cuEventRecord(e, producer_s)) + handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) + handle_return(driver.cuEventDestroy(e)) return buf diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index e35f2c7b0..7980da185 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -15,6 +15,7 @@ import cuda.core.experimental from cuda.core.experimental import Device +from cuda.core.experimental._memoryview import view_as_cai from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory @@ -164,3 +165,34 @@ def _check_view(self, view, in_arr, dev): assert view.is_device_accessible is True assert view.exporting_obj is in_arr # can't test view.readonly with CuPy or Numba... + + +@pytest.mark.skipif(cp is None, reason="CuPy is not installed") +@pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),)) +class TestViewCudaArrayInterfaceGPU: + def test_cuda_array_interface_gpu(self, in_arr, use_stream): + # TODO: use the device fixture? + dev = Device() + dev.set_current() + # This is the consumer stream + s = dev.create_stream() if use_stream else None + + # The usual path in `StridedMemoryView` prefers the DLPack interface + # over __cuda_array_interface__, so we call `view_as_cai` directly + # here so we can test the CAI code path. + view = view_as_cai(in_arr, stream_ptr=s.handle if s else -1) + self._check_view(view, in_arr, dev) + + def _check_view(self, view, in_arr, dev): + assert isinstance(view, StridedMemoryView) + assert view.ptr == gpu_array_ptr(in_arr) + assert view.shape == in_arr.shape + strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize) + if in_arr.flags["C_CONTIGUOUS"]: + assert view.strides is None + else: + assert view.strides == strides_in_counts + assert view.dtype == in_arr.dtype + assert view.device_id == dev.device_id + assert view.is_device_accessible is True + assert view.exporting_obj is in_arr From a0f16902904a1f593d388234f57c45aa9ed7128c Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 18 Aug 2025 10:02:36 -0400 Subject: [PATCH 2/8] Fix type of device_id --- cuda_core/cuda/core/experimental/_memoryview.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 418967fa5..a39da887b 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -75,7 +75,7 @@ cdef class StridedMemoryView: """ cdef readonly: intptr_t ptr - int device_id + intptr_t device_id bint is_device_accessible bint readonly object exporting_obj From 9a9a928eaa27f4aaf6c93476cdcfb7d5ae487199 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 18 Aug 2025 11:29:44 -0400 Subject: [PATCH 3/8] Update cuda_core/cuda/core/experimental/_memoryview.pyx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memoryview.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx 
b/cuda_core/cuda/core/experimental/_memoryview.pyx index a39da887b..24b644e58 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -78,7 +78,7 @@ cdef class StridedMemoryView: intptr_t device_id bint is_device_accessible bint readonly - object exporting_obj + object exporting_obj # The tensor object if has obj has __dlpack__, otherwise must be NULL cdef DLTensor *dl_tensor From 06d4ab41cd7bbbb0605bbabf392f51d524788190 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 19 Aug 2025 11:40:24 -0400 Subject: [PATCH 4/8] Memoize the properties shape, strides, and dtype --- .../cuda/core/experimental/_memoryview.pyx | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 24b644e58..2996d5894 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -69,7 +69,7 @@ cdef class StridedMemoryView: obj : Any Any objects that supports either DLPack (up to v1.0) or CUDA Array Interface (v3). - stream_ptr: Optional[int] + stream_ptr: int The pointer address (as Python `int`) to the **consumer** stream. Stream ordering will be properly established unless ``-1`` is passed. """ @@ -85,6 +85,11 @@ cdef class StridedMemoryView: # A strong reference to the result of obj.__dlpack__() so we # can lazily create shape and strides from it later cdef object dlpack_capsule + + # Memoized properties + cdef tuple _shape + cdef object _strides + cdef object _dtype def __init__(self, obj=None, stream_ptr=None): if obj is not None: @@ -98,23 +103,25 @@ cdef class StridedMemoryView: @property def shape(self) -> tuple[int]: - if self.exporting_obj is not None: + if self._shape is None and self.exporting_obj is not None: if self.dl_tensor != NULL: - return cuda_utils.carray_int64_t_to_tuple( + self._shape = cuda_utils.carray_int64_t_to_tuple( self.dl_tensor.shape, self.dl_tensor.ndim ) else: - return self.exporting_obj.__cuda_array_interface__["shape"] - return () + self._shape = self.exporting_obj.__cuda_array_interface__["shape"] + else: + self._shape = () + return self._shape @property def strides(self) -> Optional[tuple[int]]: cdef int itemsize - if self.exporting_obj is not None: + if self._strides is None and self.exporting_obj is not None: if self.dl_tensor != NULL: if self.dl_tensor.strides: - return cuda_utils.carray_int64_t_to_tuple( + self._strides = cuda_utils.carray_int64_t_to_tuple( self.dl_tensor.strides, self.dl_tensor.ndim ) @@ -122,21 +129,21 @@ cdef class StridedMemoryView: strides = self.exporting_obj.__cuda_array_interface__.get("strides") if strides is not None: itemsize = self.dtype.itemsize - result = cpython.PyTuple_New(len(strides)) + self._strides = cpython.PyTuple_New(len(strides)) for i in range(len(strides)): - cpython.PyTuple_SET_ITEM(result, i, strides[i] // itemsize) - return result - return None + cpython.PyTuple_SET_ITEM(self._strides, i, strides[i] // itemsize) + return self._strides @property def dtype(self) -> Optional[numpy.dtype]: - if self.exporting_obj is not None: - if self.dl_tensor != NULL: - return dtype_dlpack_to_numpy(&self.dl_tensor.dtype) - else: - # TODO: this only works for built-in numeric types - return numpy.dtype(self.exporting_obj.__cuda_array_interface__["typestr"]) - return None + if self._dtype is None: + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + self._dtype = 
dtype_dlpack_to_numpy(&self.dl_tensor.dtype) + else: + # TODO: this only works for built-in numeric types + self._dtype = numpy.dtype(self.exporting_obj.__cuda_array_interface__["typestr"]) + return self._dtype def __repr__(self): return (f"StridedMemoryView(ptr={self.ptr},\n" From 5c1b7a92074021ff0565cb5d63700875656d8657 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 19 Aug 2025 11:41:50 -0400 Subject: [PATCH 5/8] We don't need to handle stream_ptr == None --- cuda_core/cuda/core/experimental/_memoryview.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 2996d5894..4b887c035 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -353,7 +353,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.ptr)) cdef intptr_t producer_s, consumer_s - stream_ptr = int(stream_ptr) if stream_ptr is not None else -1 + stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") if stream is not None: From d9270a151c2814268652f43701bfc6c03432b0ff Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 19 Aug 2025 11:55:51 -0400 Subject: [PATCH 6/8] device_id can be an int --- cuda_core/cuda/core/experimental/_memoryview.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 4b887c035..053d3a6ff 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -75,7 +75,7 @@ cdef class StridedMemoryView: """ cdef readonly: intptr_t ptr - intptr_t device_id + int device_id bint is_device_accessible bint readonly object exporting_obj From 0078990c29382b2284b24e98864a6998716d2924 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 19 Aug 2025 11:56:04 -0400 Subject: [PATCH 7/8] Also cache the cai_data --- .../cuda/core/experimental/_memoryview.pyx | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 053d3a6ff..584a9ba9e 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -80,11 +80,14 @@ cdef class StridedMemoryView: bint readonly object exporting_obj + # If using dlpack, this is a strong reference to the result of + # obj.__dlpack__() so we can lazily create shape and strides from + # it later. If using CAI, this is a reference to the source + # `__cuda_array_interface__` object. 
+ cdef object metadata + # The tensor object if has obj has __dlpack__, otherwise must be NULL cdef DLTensor *dl_tensor - # A strong reference to the result of obj.__dlpack__() so we - # can lazily create shape and strides from it later - cdef object dlpack_capsule # Memoized properties cdef tuple _shape @@ -110,7 +113,7 @@ cdef class StridedMemoryView: self.dl_tensor.ndim ) else: - self._shape = self.exporting_obj.__cuda_array_interface__["shape"] + self._shape = self.metadata["shape"] else: self._shape = () return self._shape @@ -126,7 +129,7 @@ cdef class StridedMemoryView: self.dl_tensor.ndim ) else: - strides = self.exporting_obj.__cuda_array_interface__.get("strides") + strides = self.metadata.get("strides") if strides is not None: itemsize = self.dtype.itemsize self._strides = cpython.PyTuple_New(len(strides)) @@ -142,7 +145,7 @@ cdef class StridedMemoryView: self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype) else: # TODO: this only works for built-in numeric types - self._dtype = numpy.dtype(self.exporting_obj.__cuda_array_interface__["typestr"]) + self._dtype = numpy.dtype(self.metadata["typestr"]) return self._dtype def __repr__(self): @@ -262,7 +265,7 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.dl_tensor = dl_tensor - buf.dlpack_capsule = capsule + buf.metadata = capsule buf.ptr = (dl_tensor.data) buf.device_id = device_id buf.is_device_accessible = is_device_accessible @@ -344,6 +347,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.exporting_obj = obj + buf.metadata = cai_data buf.dl_tensor = NULL buf.ptr, buf.readonly = cai_data["data"] buf.is_device_accessible = True From cf377509a767deafee86849d853053b11f59901f Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 21 Aug 2025 14:59:27 -0400 Subject: [PATCH 8/8] Don't recompute strides --- .../cuda/core/experimental/_memoryview.pyx | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 584a9ba9e..9d2413305 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -91,7 +91,8 @@ cdef class StridedMemoryView: # Memoized properties cdef tuple _shape - cdef object _strides + cdef tuple _strides + cdef bint _strides_init # Has the strides tuple been init'ed? 
cdef object _dtype def __init__(self, obj=None, stream_ptr=None): @@ -121,20 +122,24 @@ cdef class StridedMemoryView: @property def strides(self) -> Optional[tuple[int]]: cdef int itemsize - if self._strides is None and self.exporting_obj is not None: - if self.dl_tensor != NULL: - if self.dl_tensor.strides: - self._strides = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.strides, - self.dl_tensor.ndim - ) - else: - strides = self.metadata.get("strides") - if strides is not None: - itemsize = self.dtype.itemsize - self._strides = cpython.PyTuple_New(len(strides)) - for i in range(len(strides)): - cpython.PyTuple_SET_ITEM(self._strides, i, strides[i] // itemsize) + if self._strides_init is False: + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + if self.dl_tensor.strides: + self._strides = cuda_utils.carray_int64_t_to_tuple( + self.dl_tensor.strides, + self.dl_tensor.ndim + ) + else: + strides = self.metadata.get("strides") + if strides is not None: + itemsize = self.dtype.itemsize + self._strides = cpython.PyTuple_New(len(strides)) + for i in range(len(strides)): + cpython.PyTuple_SET_ITEM( + self._strides, i, strides[i] // itemsize + ) + self._strides_init = True return self._strides @property
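
Usage sketch (not part of the patches above): a minimal illustration of how the lazily computed, memoized attributes behave after this series, mirroring the new CAI test added in test_utils.py. It assumes CuPy and a CUDA-capable device are available, and uses the private `view_as_cai` helper exactly as the new test does; the expected values follow the semantics asserted there (strides reported in element counts, `None` for C-contiguous arrays).

    import cupy as cp

    from cuda.core.experimental import Device
    from cuda.core.experimental._memoryview import view_as_cai

    dev = Device()
    dev.set_current()
    s = dev.create_stream()

    arr = cp.ones((4, 5), dtype=cp.float32)
    view = view_as_cai(arr, stream_ptr=s.handle)
    assert view.shape == (4, 5)          # computed on first access, then memoized
    assert view.strides is None          # C-contiguous: CAI exports strides=None
    assert view.dtype == arr.dtype
    assert view.device_id == dev.device_id

    # Strides come back in element counts, not bytes.
    sliced = arr[:, ::2]                 # byte strides (20, 8) -> counts (5, 2)
    view2 = view_as_cai(sliced, stream_ptr=s.handle)
    assert view2.shape == (4, 3)
    assert view2.strides == (5, 2)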