Fix #449: Delay construction of Python attributes #847

Open · wants to merge 10 commits into main
139 changes: 93 additions & 46 deletions cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -18,7 +18,6 @@ from cuda.core.experimental._utils cimport cuda_utils
# TODO(leofang): support NumPy structured dtypes


@cython.dataclasses.dataclass
cdef class StridedMemoryView:
"""A dataclass holding metadata of a strided dense array/tensor.

@@ -51,7 +50,7 @@ cdef class StridedMemoryView:
Pointer to the tensor buffer (as a Python `int`).
shape : tuple
Shape of the tensor.
strides : tuple
strides : Optional[tuple]
Contributor Author
Just updating docs to match current behavior (not a change in behavior)
Strides of the tensor (in **counts**, not bytes).
dtype: numpy.dtype
Data type of the tensor.
@@ -74,15 +73,27 @@ cdef class StridedMemoryView:
The pointer address (as Python `int`) to the **consumer** stream.
Stream ordering will be properly established unless ``-1`` is passed.
"""
# TODO: switch to use Cython's cdef typing?
ptr: int = None
shape: tuple = None
strides: tuple = None # in counts, not bytes
dtype: numpy.dtype = None
device_id: int = None # -1 for CPU
is_device_accessible: bool = None
readonly: bool = None
exporting_obj: Any = None
cdef readonly:
intptr_t ptr
int device_id
bint is_device_accessible
bint readonly
object exporting_obj

# If using dlpack, this is a strong reference to the result of
# obj.__dlpack__() so we can lazily create shape and strides from
# it later. If using CAI, this is a reference to the source
# `__cuda_array_interface__` object.
cdef object metadata

    # The DLPack tensor if obj has __dlpack__; otherwise must be NULL
cdef DLTensor *dl_tensor

# Memoized properties
cdef tuple _shape
cdef tuple _strides
cdef bint _strides_init # Has the strides tuple been init'ed?
cdef object _dtype

def __init__(self, obj=None, stream_ptr=None):
if obj is not None:
@@ -92,9 +103,56 @@
else:
view_as_cai(obj, stream_ptr, self)
else:
# default construct
pass

    @property
    def shape(self) -> tuple[int, ...]:
        if self._shape is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._shape = cuda_utils.carray_int64_t_to_tuple(
                        self.dl_tensor.shape,
                        self.dl_tensor.ndim
                    )
                else:
                    self._shape = tuple(self.metadata["shape"])
            else:
                self._shape = ()
        return self._shape

    @property
    def strides(self) -> Optional[tuple[int, ...]]:
        cdef int itemsize
        if self._strides_init is False:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    if self.dl_tensor.strides:
                        self._strides = cuda_utils.carray_int64_t_to_tuple(
                            self.dl_tensor.strides,
                            self.dl_tensor.ndim
                        )
                else:
                    strides = self.metadata.get("strides")
                    if strides is not None:
                        itemsize = self.dtype.itemsize
                        self._strides = cpython.PyTuple_New(len(strides))
                        for i in range(len(strides)):
                            # CAI strides are in bytes; store element counts.
                            count = strides[i] // itemsize
                            # PyTuple_SET_ITEM steals a reference, so add one
                            # for the tuple to own.
                            cpython.Py_INCREF(count)
                            cpython.PyTuple_SET_ITEM(self._strides, i, count)
            self._strides_init = True
        return self._strides

@property
def dtype(self) -> Optional[numpy.dtype]:
if self._dtype is None:
if self.exporting_obj is not None:
if self.dl_tensor != NULL:
self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype)
else:
# TODO: this only works for built-in numeric types
self._dtype = numpy.dtype(self.metadata["typestr"])
return self._dtype

def __repr__(self):
return (f"StridedMemoryView(ptr={self.ptr},\n"
+ f" shape={self.shape},\n"
@@ -152,7 +210,7 @@ cdef class _StridedMemoryViewProxy:

cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
cdef int dldevice, device_id, i
cdef bint is_device_accessible, versioned, is_readonly
cdef bint is_device_accessible, is_readonly
is_device_accessible = False
dldevice, device_id = obj.__dlpack_device__()
if dldevice == _kDLCPU:
@@ -193,7 +251,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME):
data = cpython.PyCapsule_GetPointer(
capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)
versioned = True
dlm_tensor_ver = <DLManagedTensorVersioned*>data
dl_tensor = &dlm_tensor_ver.dl_tensor
is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0)
@@ -202,32 +259,24 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
capsule, DLPACK_TENSOR_UNUSED_NAME):
data = cpython.PyCapsule_GetPointer(
capsule, DLPACK_TENSOR_UNUSED_NAME)
versioned = False
dlm_tensor = <DLManagedTensor*>data
dl_tensor = &dlm_tensor.dl_tensor
is_readonly = False
used_name = DLPACK_TENSOR_USED_NAME
else:
assert False

cpython.PyCapsule_SetName(capsule, used_name)

cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
buf.dl_tensor = dl_tensor
buf.metadata = capsule
buf.ptr = <intptr_t>(dl_tensor.data)

buf.shape = cuda_utils.carray_int64_t_to_tuple(dl_tensor.shape, dl_tensor.ndim)
if dl_tensor.strides:
buf.strides = cuda_utils.carray_int64_t_to_tuple(dl_tensor.strides, dl_tensor.ndim)
else:
# C-order
buf.strides = None

buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype)
buf.device_id = device_id
buf.is_device_accessible = is_device_accessible
buf.readonly = is_readonly
buf.exporting_obj = obj

cpython.PyCapsule_SetName(capsule, used_name)

return buf


@@ -291,7 +340,8 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
return numpy.dtype(np_dtype)


cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
# Also generate for Python so we can test this code path
cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
cdef dict cai_data = obj.__cuda_array_interface__
if cai_data["version"] < 3:
raise BufferError("only CUDA Array Interface v3 or above is supported")
@@ -302,33 +352,30 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):

cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
buf.exporting_obj = obj
buf.metadata = cai_data
buf.dl_tensor = NULL
buf.ptr, buf.readonly = cai_data["data"]
buf.shape = cai_data["shape"]
# TODO: this only works for built-in numeric types
buf.dtype = numpy.dtype(cai_data["typestr"])
buf.strides = cai_data.get("strides")
if buf.strides is not None:
# convert to counts
buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides)
buf.is_device_accessible = True
buf.device_id = handle_return(
driver.cuPointerGetAttribute(
driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
buf.ptr))

cdef intptr_t producer_s, consumer_s
stream = cai_data.get("stream")
if stream is not None:
producer_s = <intptr_t>(stream)
consumer_s = <intptr_t>(stream_ptr)
assert producer_s > 0
# establish stream order
if producer_s != consumer_s:
e = handle_return(driver.cuEventCreate(
driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
handle_return(driver.cuEventRecord(e, producer_s))
handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
handle_return(driver.cuEventDestroy(e))
stream_ptr = int(stream_ptr)
if stream_ptr != -1:
stream = cai_data.get("stream")
if stream is not None:
producer_s = <intptr_t>(stream)
consumer_s = <intptr_t>(stream_ptr)
assert producer_s > 0
# establish stream order
if producer_s != consumer_s:
e = handle_return(driver.cuEventCreate(
driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
handle_return(driver.cuEventRecord(e, producer_s))
handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
handle_return(driver.cuEventDestroy(e))

return buf
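
For context, a minimal consumer-side sketch of how the now-lazy attributes behave (illustrative only, not part of this diff; assumes CuPy is installed): `ptr`, `device_id`, `is_device_accessible`, `readonly`, and `exporting_obj` are still populated eagerly by `view_as_dlpack`/`view_as_cai`, while `shape`, `strides`, and `dtype` are only materialized from the retained metadata on first property access.

import cupy as cp

from cuda.core.experimental import Device
from cuda.core.experimental.utils import StridedMemoryView

dev = Device()
dev.set_current()
s = dev.create_stream()

arr = cp.arange(12, dtype=cp.float32).reshape(3, 4)

# Construction goes through view_as_dlpack: only ptr/device_id/readonly/
# exporting_obj are filled in; the DLPack capsule is kept as `metadata`.
view = StridedMemoryView(arr, stream_ptr=s.handle)

print(view.ptr, view.device_id, view.is_device_accessible)
print(view.shape)    # (3, 4), built from the DLTensor on first access
print(view.strides)  # None for C-order, or strides in element counts
print(view.dtype)    # float32, converted via dtype_dlpack_to_numpy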

32 changes: 32 additions & 0 deletions cuda_core/tests/test_utils.py
@@ -15,6 +15,7 @@

import cuda.core.experimental
from cuda.core.experimental import Device
from cuda.core.experimental._memoryview import view_as_cai
from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory


@@ -164,3 +165,34 @@ def _check_view(self, view, in_arr, dev):
assert view.is_device_accessible is True
assert view.exporting_obj is in_arr
# can't test view.readonly with CuPy or Numba...


@pytest.mark.skipif(cp is None, reason="CuPy is not installed")
@pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),))
class TestViewCudaArrayInterfaceGPU:
def test_cuda_array_interface_gpu(self, in_arr, use_stream):
# TODO: use the device fixture?
dev = Device()
dev.set_current()
# This is the consumer stream
s = dev.create_stream() if use_stream else None

        # The usual path in `StridedMemoryView` prefers the DLPack interface
        # over __cuda_array_interface__, so we call `view_as_cai` directly
        # here to exercise the CAI code path.
view = view_as_cai(in_arr, stream_ptr=s.handle if s else -1)
self._check_view(view, in_arr, dev)

def _check_view(self, view, in_arr, dev):
assert isinstance(view, StridedMemoryView)
assert view.ptr == gpu_array_ptr(in_arr)
assert view.shape == in_arr.shape
strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize)
if in_arr.flags["C_CONTIGUOUS"]:
assert view.strides is None
else:
assert view.strides == strides_in_counts
assert view.dtype == in_arr.dtype
assert view.device_id == dev.device_id
assert view.is_device_accessible is True
assert view.exporting_obj is in_arr
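
`convert_strides_to_counts` is an existing helper defined elsewhere in test_utils.py (outside this hunk); a minimal sketch of the conversion it is assumed to perform, matching the counts-not-bytes convention documented on `StridedMemoryView` and used by the CAI path above:

def convert_strides_to_counts(strides, itemsize):
    # Byte strides -> element-count strides, e.g. a float32 (3, 4) C-order
    # array has byte strides (16, 4), which become counts (4, 1).
    return tuple(s // itemsize for s in strides)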