diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 31482229c..9d2413305 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -18,7 +18,6 @@ from cuda.core.experimental._utils cimport cuda_utils


 # TODO(leofang): support NumPy structured dtypes
-@cython.dataclasses.dataclass
 cdef class StridedMemoryView:
     """A dataclass holding metadata of a strided dense array/tensor.

@@ -51,7 +50,7 @@ cdef class StridedMemoryView:
         Pointer to the tensor buffer (as a Python `int`).
     shape : tuple
         Shape of the tensor.
-    strides : tuple
+    strides : Optional[tuple]
         Strides of the tensor (in **counts**, not bytes).
     dtype: numpy.dtype
         Data type of the tensor.
@@ -74,15 +73,27 @@ cdef class StridedMemoryView:
         The pointer address (as Python `int`) to the **consumer** stream.
         Stream ordering will be properly established unless ``-1`` is passed.
     """
-    # TODO: switch to use Cython's cdef typing?
-    ptr: int = None
-    shape: tuple = None
-    strides: tuple = None  # in counts, not bytes
-    dtype: numpy.dtype = None
-    device_id: int = None  # -1 for CPU
-    is_device_accessible: bool = None
-    readonly: bool = None
-    exporting_obj: Any = None
+    cdef readonly:
+        intptr_t ptr
+        int device_id
+        bint is_device_accessible
+        bint readonly
+        object exporting_obj
+
+    # If using DLPack, this is a strong reference to the result of
+    # obj.__dlpack__() so we can lazily create shape and strides from
+    # it later. If using CAI, this is a reference to the source
+    # `__cuda_array_interface__` object.
+    cdef object metadata
+
+    # The DLTensor pointer if obj has __dlpack__; otherwise must be NULL.
+    cdef DLTensor *dl_tensor
+
+    # Memoized properties
+    cdef tuple _shape
+    cdef tuple _strides
+    cdef bint _strides_init  # Has the strides tuple been init'ed?
+    cdef object _dtype

     def __init__(self, obj=None, stream_ptr=None):
         if obj is not None:
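Note on the new layout: `metadata` keeps the producer's DLPack capsule (or CAI dict) alive, and `shape`/`strides`/`dtype` are derived from it on first access, then cached. `_strides_init` is a separate flag because `None` (C-order) is itself a valid cached value for `strides`, so "is it None?" cannot double as "was it computed?". A minimal pure-Python sketch of this memoization pattern (illustrative only; `LazyView` is a made-up name, not part of this change):

    class LazyView:
        def __init__(self, metadata):
            self._metadata = metadata   # kept alive, like `metadata` above
            self._strides = None        # None is also a legal cached result...
            self._strides_init = False  # ...so a separate flag marks "computed"

        @property
        def strides(self):
            if not self._strides_init:
                self._strides = self._metadata.get("strides")  # computed once
                self._strides_init = True
            return self._strides

    view = LazyView({"shape": (2, 3)})  # C-contiguous exporter: no "strides" key
    assert view.strides is None         # computed on first access
    assert view.strides is None         # served from the cache afterwards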
@@ -92,9 +103,56 @@ cdef class StridedMemoryView:
             else:
                 view_as_cai(obj, stream_ptr, self)
         else:
-            # default construct
             pass

+    @property
+    def shape(self) -> tuple[int]:
+        if self._shape is None:
+            if self.exporting_obj is not None:
+                if self.dl_tensor != NULL:
+                    self._shape = cuda_utils.carray_int64_t_to_tuple(
+                        self.dl_tensor.shape,
+                        self.dl_tensor.ndim
+                    )
+                else:
+                    self._shape = self.metadata["shape"]
+            else:
+                self._shape = ()
+        return self._shape
+
+    @property
+    def strides(self) -> Optional[tuple[int]]:
+        cdef int itemsize
+        if self._strides_init is False:
+            if self.exporting_obj is not None:
+                if self.dl_tensor != NULL:
+                    if self.dl_tensor.strides:
+                        self._strides = cuda_utils.carray_int64_t_to_tuple(
+                            self.dl_tensor.strides,
+                            self.dl_tensor.ndim
+                        )
+                else:
+                    strides = self.metadata.get("strides")
+                    if strides is not None:
+                        itemsize = self.dtype.itemsize
+                        self._strides = cpython.PyTuple_New(len(strides))
+                        for i in range(len(strides)):
+                            # CAI strides are in bytes; convert to counts
+                            s = strides[i] // itemsize
+                            # PyTuple_SET_ITEM steals a reference
+                            cpython.Py_INCREF(s)
+                            cpython.PyTuple_SET_ITEM(self._strides, i, s)
+            self._strides_init = True
+        return self._strides
+
+    @property
+    def dtype(self) -> Optional[numpy.dtype]:
+        if self._dtype is None:
+            if self.exporting_obj is not None:
+                if self.dl_tensor != NULL:
+                    self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype)
+                else:
+                    # TODO: this only works for built-in numeric types
+                    self._dtype = numpy.dtype(self.metadata["typestr"])
+        return self._dtype
+
     def __repr__(self):
         return (f"StridedMemoryView(ptr={self.ptr},\n"
               + f"                  shape={self.shape},\n"
@@ -152,7 +210,7 @@ cdef class _StridedMemoryViewProxy:

 cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
     cdef int dldevice, device_id, i
-    cdef bint is_device_accessible, versioned, is_readonly
+    cdef bint is_device_accessible, is_readonly
     is_device_accessible = False
     dldevice, device_id = obj.__dlpack_device__()
     if dldevice == _kDLCPU:
@@ -193,7 +251,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
             capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME):
         data = cpython.PyCapsule_GetPointer(
             capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)
-        versioned = True
         dlm_tensor_ver = <DLManagedTensorVersioned*>data
         dl_tensor = &dlm_tensor_ver.dl_tensor
         is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0)
@@ -202,7 +259,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
             capsule, DLPACK_TENSOR_UNUSED_NAME):
         data = cpython.PyCapsule_GetPointer(
             capsule, DLPACK_TENSOR_UNUSED_NAME)
-        versioned = False
         dlm_tensor = <DLManagedTensor*>data
         dl_tensor = &dlm_tensor.dl_tensor
         is_readonly = False
@@ -210,24 +266,17 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
     else:
         assert False

+    cpython.PyCapsule_SetName(capsule, used_name)
+
     cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
+    buf.dl_tensor = dl_tensor
+    buf.metadata = capsule
     buf.ptr = <intptr_t>(dl_tensor.data)
-
-    buf.shape = cuda_utils.carray_int64_t_to_tuple(dl_tensor.shape, dl_tensor.ndim)
-    if dl_tensor.strides:
-        buf.strides = cuda_utils.carray_int64_t_to_tuple(dl_tensor.strides, dl_tensor.ndim)
-    else:
-        # C-order
-        buf.strides = None
-
-    buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype)
     buf.device_id = device_id
     buf.is_device_accessible = is_device_accessible
     buf.readonly = is_readonly
     buf.exporting_obj = obj
-    cpython.PyCapsule_SetName(capsule, used_name)
-
     return buf


@@ -291,7 +340,8 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype):

     return numpy.dtype(np_dtype)


-cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
+# Also generate a Python entry point (cpdef) so we can test this code path
+cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
     cdef dict cai_data = obj.__cuda_array_interface__
     if cai_data["version"] < 3:
         raise BufferError("only CUDA Array Interface v3 or above is supported")
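Note: the two interfaces report strides in different units. DLPack already gives strides in element counts, while CAI gives them in bytes, hence the `// itemsize` division in the `strides` property above. A quick NumPy illustration of the byte-to-count conversion that property performs (illustrative only):

    import numpy as np

    a = np.zeros((3, 4), dtype=np.float64)            # itemsize = 8 bytes
    print(a.strides)                                  # (32, 8)  -- in bytes
    print(tuple(s // a.itemsize for s in a.strides))  # (4, 1)   -- in counts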
@@ -302,14 +352,9 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):

     cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
     buf.exporting_obj = obj
+    buf.metadata = cai_data
+    buf.dl_tensor = NULL
     buf.ptr, buf.readonly = cai_data["data"]
-    buf.shape = cai_data["shape"]
-    # TODO: this only works for built-in numeric types
-    buf.dtype = numpy.dtype(cai_data["typestr"])
-    buf.strides = cai_data.get("strides")
-    if buf.strides is not None:
-        # convert to counts
-        buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides)
     buf.is_device_accessible = True
     buf.device_id = handle_return(
         driver.cuPointerGetAttribute(
             driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
             buf.ptr))
@@ -317,18 +362,20 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):

     cdef intptr_t producer_s, consumer_s
-    stream = cai_data.get("stream")
-    if stream is not None:
-        producer_s = <intptr_t>(stream)
-        consumer_s = <intptr_t>(stream_ptr)
-        assert producer_s > 0
-        # establish stream order
-        if producer_s != consumer_s:
-            e = handle_return(driver.cuEventCreate(
-                driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
-            handle_return(driver.cuEventRecord(e, producer_s))
-            handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
-            handle_return(driver.cuEventDestroy(e))
+    stream_ptr = int(stream_ptr)
+    if stream_ptr != -1:
+        stream = cai_data.get("stream")
+        if stream is not None:
+            producer_s = <intptr_t>(stream)
+            consumer_s = <intptr_t>(stream_ptr)
+            assert producer_s > 0
+            # establish stream order
+            if producer_s != consumer_s:
+                e = handle_return(driver.cuEventCreate(
+                    driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
+                handle_return(driver.cuEventRecord(e, producer_s))
+                handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
+                handle_return(driver.cuEventDestroy(e))

     return buf
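Note: with the new guard, stream ordering is skipped entirely when the consumer passes `stream_ptr=-1`. When it does run, it uses the standard record-and-wait pattern: record an event on the producer stream and make the consumer stream wait on that event, with no host-side synchronization. The same pattern sketched with CuPy's stream/event wrappers (an illustration of the semantics, not the driver-level code above):

    import cupy as cp

    producer = cp.cuda.Stream(non_blocking=True)
    consumer = cp.cuda.Stream(non_blocking=True)

    with producer:
        a = cp.zeros(1 << 20)          # work enqueued on the producer stream

    e = cp.cuda.Event(disable_timing=True)
    e.record(producer)                 # like cuEventRecord(e, producer_s)
    consumer.wait_event(e)             # like cuStreamWaitEvent(consumer_s, e, 0)

    with consumer:
        a += 1                         # now ordered after the producer's work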
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index e35f2c7b0..7980da185 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -15,6 +15,7 @@ import cuda.core.experimental

 from cuda.core.experimental import Device
+from cuda.core.experimental._memoryview import view_as_cai
 from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
@@ -164,3 +165,34 @@ def _check_view(self, view, in_arr, dev):
         assert view.is_device_accessible is True
         assert view.exporting_obj is in_arr
         # can't test view.readonly with CuPy or Numba...
+
+
+@pytest.mark.skipif(cp is None, reason="CuPy is not installed")
+@pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),))
+class TestViewCudaArrayInterfaceGPU:
+    def test_cuda_array_interface_gpu(self, in_arr, use_stream):
+        # TODO: use the device fixture?
+        dev = Device()
+        dev.set_current()
+        # This is the consumer stream
+        s = dev.create_stream() if use_stream else None
+
+        # The usual path in `StridedMemoryView` prefers the DLPack interface
+        # over `__cuda_array_interface__`, so we call `view_as_cai` directly
+        # here to exercise the CAI code path.
+        view = view_as_cai(in_arr, stream_ptr=s.handle if s else -1)
+        self._check_view(view, in_arr, dev)
+
+    def _check_view(self, view, in_arr, dev):
+        assert isinstance(view, StridedMemoryView)
+        assert view.ptr == gpu_array_ptr(in_arr)
+        assert view.shape == in_arr.shape
+        strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize)
+        if in_arr.flags["C_CONTIGUOUS"]:
+            assert view.strides is None
+        else:
+            assert view.strides == strides_in_counts
+        assert view.dtype == in_arr.dtype
+        assert view.device_id == dev.device_id
+        assert view.is_device_accessible is True
+        assert view.exporting_obj is in_arr
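Note: the new test exercises the CAI path directly; the DLPack path is reached through the public constructor. A minimal sketch of that public path with a NumPy exporter (assumes NumPy >= 1.23 for `__dlpack__` support; `stream_ptr=-1` skips stream ordering, which is appropriate for CPU data):

    import numpy as np
    from cuda.core.experimental.utils import StridedMemoryView

    arr = np.arange(12, dtype=np.float64).reshape(3, 4)
    view = StridedMemoryView(arr, stream_ptr=-1)

    assert view.shape == (3, 4)       # materialized lazily from the DLTensor
    assert view.dtype == np.float64
    assert view.exporting_obj is arr
    print(view.strides)               # element counts, or None for C-order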