From 07a48d7ff4e729e739003ef43b1a3d2225469e99 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 15 Nov 2024 11:32:03 -0800 Subject: [PATCH 1/7] Systematically replace `__del__` with `weakref.finalize()` --- cuda_core/cuda/core/experimental/_event.py | 15 +++++------ cuda_core/cuda/core/experimental/_memory.py | 8 +++--- cuda_core/cuda/core/experimental/_module.py | 7 +++-- cuda_core/cuda/core/experimental/_program.py | 12 ++++----- cuda_core/cuda/core/experimental/_stream.py | 27 +++++++++----------- 5 files changed, 33 insertions(+), 36 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index a6d5da281..0ee49ded4 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from typing import Optional +import weakref from cuda import cuda from cuda.core.experimental._utils import check_or_create_options @@ -50,19 +51,21 @@ class Event: and they should instead be created through a :obj:`Stream` object. """ - __slots__ = ("_handle", "_timing_disabled", "_busy_waited") + __slots__ = ("__weakref__", "_handle", "_timing_disabled", "_busy_waited") def __init__(self): - self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call " "call Stream.record().") + def _enable_finalize(self): + self._handle = None + weakref.finalize(self, self.close) + @staticmethod def _init(options: Optional[EventOptions]=None): self = Event.__new__(Event) - # minimal requirements for the destructor - self._handle = None + self._enable_finalize() options = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 @@ -79,10 +82,6 @@ def _init(options: Optional[EventOptions]=None): self._handle = handle_return(cuda.cuEventCreate(flags)) return self - def __del__(self): - """Return close(self)""" - self.close() - def close(self): """Destroy the event.""" if self._handle: diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 678f26ee8..dbed08301 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -6,6 +6,7 @@ import abc from typing import Optional, Tuple, TypeVar +import weakref import warnings from cuda import cuda @@ -44,16 +45,13 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ("_ptr", "_size", "_mr",) + __slots__ = ("__weakref__", "_ptr", "_size", "_mr",) def __init__(self, ptr, size, mr: MemoryResource=None): self._ptr = ptr self._size = size self._mr = mr - - def __del__(self): - """Return close(self).""" - self.close() + weakref.finalize(self, self.close) def close(self, stream=None): """Deallocate this buffer asynchronously on the given stream. diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 7621b9ee4..2a5d73701 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import importlib.metadata +import weakref from cuda import cuda, cudart from cuda.core.experimental._utils import handle_return @@ -104,7 +105,8 @@ class ObjectCode: """ - __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") + __slots__ = ("__weakref__", "_handle", "_code_type", "_module", "_loader", + "_sym_map") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin") def __init__(self, module, code_type, jit_options=None, *, @@ -113,6 +115,7 @@ def __init__(self, module, code_type, jit_options=None, *, raise ValueError _lazy_init() self._handle = None + weakref.finalize(self, self.close) backend = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" self._loader = _backend[backend] @@ -140,7 +143,7 @@ def __init__(self, module, code_type, jit_options=None, *, self._module = module self._sym_map = {} if symbol_mapping is None else symbol_mapping - def __del__(self): + def close(self): # TODO: do we want to unload? Probably not.. pass diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 75b7313f4..d640717ce 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import weakref + from cuda import nvrtc from cuda.core.experimental._utils import handle_return from cuda.core.experimental._module import ObjectCode @@ -24,15 +26,17 @@ class Program: """ - __slots__ = ("_handle", "_backend", ) + __slots__ = ("__weakref__", "_handle", "_backend", ) _supported_code_type = ("c++", ) _supported_target_type = ("ptx", "cubin", "ltoir") def __init__(self, code, code_type): - self._handle = None if code_type not in self._supported_code_type: raise NotImplementedError + self._handle = None + weakref.finalize(self, self.close) + if code_type.lower() == "c++": if not isinstance(code, str): raise TypeError @@ -44,10 +48,6 @@ def __init__(self, code, code_type): else: raise NotImplementedError - def __del__(self): - """Return close(self).""" - self.close() - def close(self): """Destroy this program.""" if self._handle is not None: diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 7f50dafdb..abb49fcac 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -7,6 +7,7 @@ from dataclasses import dataclass import os from typing import Optional, Tuple, TYPE_CHECKING, Union +import weakref if TYPE_CHECKING: from cuda.core.experimental._device import Device @@ -53,27 +54,25 @@ class Stream: """ - __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", - "_device_id", "_ctx_handle") + __slots__ = ("__weakref__", "_handle", "_nonblocking", "_priority", + "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): - # minimal requirements for the destructor - self._handle = None - self._owner = None - self._builtin = False raise NotImplementedError( "directly creating a Stream object can be ambiguous. Please either " "call Device.create_stream() or, if a stream pointer is already " "available from somewhere else, Stream.from_handle()") - @staticmethod - def _init(obj=None, *, options: Optional[StreamOptions]=None): - self = Stream.__new__(Stream) - - # minimal requirements for the destructor + def _enable_finalize(self): self._handle = None self._owner = None self._builtin = False + weakref.finalize(self, self.close) + + @staticmethod + def _init(obj=None, *, options: Optional[StreamOptions]=None): + self = Stream.__new__(Stream) + self._enable_finalize() if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") @@ -118,10 +117,6 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): self._ctx_handle = None # delayed return self - def __del__(self): - """Return close(self).""" - self.close() - def close(self): """Destroy the stream. @@ -295,6 +290,7 @@ def __cuda_stream__(self): class _LegacyDefaultStream(Stream): def __init__(self): + self._enable_finalize() self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY) self._owner = None self._nonblocking = None # delayed @@ -305,6 +301,7 @@ def __init__(self): class _PerThreadDefaultStream(Stream): def __init__(self): + self._enable_finalize() self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD) self._owner = None self._nonblocking = None # delayed From 74d2859275a6fd15fb8546b0f077d3d90dce8f80 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 26 Nov 2024 21:55:39 -0800 Subject: [PATCH 2/7] Event._finalize() approach with self._finalizer.Detach() --- cuda_core/cuda/core/experimental/_event.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 93eae43bd..7b0021113 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -51,21 +51,19 @@ class Event: """ - __slots__ = ("__weakref__", "_handle", "_timing_disabled", "_busy_waited") + __slots__ = ("__weakref__", "_finalizer", "_handle", "_timing_disabled", "_busy_waited") def __init__(self): + self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call call Stream.record()." ) - def _enable_finalize(self): - self._handle = None - weakref.finalize(self, self.close) - @staticmethod def _init(options: Optional[EventOptions] = None): self = Event.__new__(Event) - self._enable_finalize() + # minimal requirements for the destructor + self._handle = None options = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 @@ -80,12 +78,18 @@ def _init(options: Optional[EventOptions] = None): if options.support_ipc: raise NotImplementedError("TODO") self._handle = handle_return(cuda.cuEventCreate(flags)) + self._finalizer = weakref.finalize(self, Event._finalize, self._handle) return self + @staticmethod + def _finalize(self_handle): + handle_return(cuda.cuEventDestroy(self_handle)) + def close(self): """Destroy the event.""" if self._handle: - handle_return(cuda.cuEventDestroy(self._handle)) + self._finalizer.Detach() + Event._finalize(self._handle) self._handle = None @property From 9843fa5b6a3349f770f73fc5d068b09a1bba3db0 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 27 Nov 2024 11:35:46 -0800 Subject: [PATCH 3/7] Stream._MembersNeededForFinalize() approach. Corresponding demonstration of finalize behavior (immediate cleanup): https://github.com/rwgk/stuff/blob/f6fbd670b8376003c7767c96538d8ab0b1f49d96/random_attic/weakref_finalize_toy_example.py --- cuda_core/cuda/core/experimental/_launcher.py | 2 +- cuda_core/cuda/core/experimental/_memory.py | 8 +- cuda_core/cuda/core/experimental/_stream.py | 79 ++++++++----------- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 77af6b7e1..55af5e30f 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -131,7 +131,7 @@ def launch(kernel, config, *kernel_args): drv_cfg = cuda.CUlaunchConfig() drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block - drv_cfg.hStream = config.stream._handle + drv_cfg.hStream = config.stream.handle drv_cfg.sharedMemBytes = config.shmem_size drv_cfg.numAttrs = 0 # TODO handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index b5372fa44..3bf0377e4 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -132,7 +132,7 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: dst = self._mr.allocate(self._size, stream) if dst._size != self._size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return(cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle)) + handle_return(cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream.handle)) return dst def copy_from(self, src: Buffer, *, stream): @@ -151,7 +151,7 @@ def copy_from(self, src: Buffer, *, stream): raise ValueError("stream must be provided") if src._size != self._size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return(cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle)) + handle_return(cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream.handle)) def __dlpack__( self, @@ -240,13 +240,13 @@ def __init__(self, dev_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() - ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream._handle)) + ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): if stream is None: stream = default_stream() - handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + handle_return(cuda.cuMemFreeAsync(ptr, stream.handle)) @property def is_device_accessible(self) -> bool: diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 830c85963..22364d1a8 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -53,16 +53,24 @@ class Stream: """ - __slots__ = ( - "__weakref__", - "_handle", - "_nonblocking", - "_priority", - "_owner", - "_builtin", - "_device_id", - "_ctx_handle", - ) + class _MembersNeededForFinalize: + __slots__ = ("handle", "owner", "builtin") + + def __init__(self, stream_obj, handle, owner, builtin): + self.handle = handle + self.owner = owner + self.builtin = builtin + weakref.finalize(stream_obj, self.close) + + def close(self): + if self.owner is None: + if self.handle and not self.builtin: + handle_return(cuda.cuStreamDestroy(self.handle)) + else: + self.owner = None + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_nonblocking", "_priority", "_device_id", "_ctx_handle") def __init__(self): raise NotImplementedError( @@ -71,16 +79,10 @@ def __init__(self): "available from somewhere else, Stream.from_handle()" ) - def _enable_finalize(self): - self._handle = None - self._owner = None - self._builtin = False - weakref.finalize(self, self.close) - @staticmethod def _init(obj=None, *, options: Optional[StreamOptions] = None): self = Stream.__new__(Stream) - self._enable_finalize() + self._mnff = Stream._MembersNeededForFinalize(self, None, None, False) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") @@ -89,9 +91,9 @@ def _init(obj=None, *, options: Optional[StreamOptions] = None): raise ValueError info = obj.__cuda_stream__ assert info[0] == 0 - self._handle = cuda.CUstream(info[1]) + self._mnff.handle = cuda.CUstream(info[1]) # TODO: check if obj is created under the current context/device - self._owner = obj + self._mnff.owner = obj self._nonblocking = None # delayed self._priority = None # delayed self._device_id = None # delayed @@ -111,8 +113,8 @@ def _init(obj=None, *, options: Optional[StreamOptions] = None): else: priority = high - self._handle = handle_return(cuda.cuStreamCreateWithPriority(flags, priority)) - self._owner = None + self._mnff.handle = handle_return(cuda.cuStreamCreateWithPriority(flags, priority)) + self._mnff.owner = None self._nonblocking = nonblocking self._priority = priority # don't defer this because we will have to pay a cost for context @@ -128,28 +130,23 @@ def close(self): object will instead have their references released. """ - if self._owner is None: - if self._handle and not self._builtin: - handle_return(cuda.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = None + self._mnff.close() @property def __cuda_stream__(self) -> Tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, int(self._handle)) + return (0, self.handle) @property def handle(self) -> int: """Return the underlying cudaStream_t pointer address as Python int.""" - return int(self._handle) + return int(self._mnff.handle) @property def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: - flag = handle_return(cuda.cuStreamGetFlags(self._handle)) + flag = handle_return(cuda.cuStreamGetFlags(self._mnff.handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: @@ -160,13 +157,13 @@ def is_nonblocking(self) -> bool: def priority(self) -> int: """Return the stream priority.""" if self._priority is None: - prio = handle_return(cuda.cuStreamGetPriority(self._handle)) + prio = handle_return(cuda.cuStreamGetPriority(self._mnff.handle)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - handle_return(cuda.cuStreamSynchronize(self._handle)) + handle_return(cuda.cuStreamSynchronize(self._mnff.handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -194,7 +191,7 @@ def record(self, event: Event = None, options: EventOptions = None) -> Event: event = Event._init(options) elif not isinstance(event, Event): raise TypeError("record only takes an Event object") - handle_return(cuda.cuEventRecord(event.handle, self._handle)) + handle_return(cuda.cuEventRecord(event.handle, self._mnff.handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -223,7 +220,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): discard_event = True # TODO: support flags other than 0? - handle_return(cuda.cuStreamWaitEvent(self._handle, event, 0)) + handle_return(cuda.cuStreamWaitEvent(self._mnff.handle, event, 0)) if discard_event: handle_return(cuda.cuEventDestroy(event)) @@ -243,7 +240,7 @@ def device(self) -> Device: if self._device_id is None: # Get the stream context first if self._ctx_handle is None: - self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle)) + self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._mnff.handle)) self._device_id = get_device_from_ctx(self._ctx_handle) return Device(self._device_id) @@ -251,7 +248,7 @@ def device(self) -> Device: def context(self) -> Context: """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: - self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle)) + self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._mnff.handle)) if self._device_id is None: self._device_id = get_device_from_ctx(self._ctx_handle) return Context._from_ctx(self._ctx_handle, self._device_id) @@ -291,22 +288,16 @@ def __cuda_stream__(self): class _LegacyDefaultStream(Stream): def __init__(self): - self._enable_finalize() - self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY) - self._owner = None + self._mnff = Stream._MembersNeededForFinalize(self, cuda.CUstream(cuda.CU_STREAM_LEGACY), None, True) self._nonblocking = None # delayed self._priority = None # delayed - self._builtin = True class _PerThreadDefaultStream(Stream): def __init__(self): - self._enable_finalize() - self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD) - self._owner = None + self._mnff = Stream._MembersNeededForFinalize(self, cuda.CUstream(cuda.CU_STREAM_PER_THREAD), None, True) self._nonblocking = None # delayed self._priority = None # delayed - self._builtin = True LEGACY_DEFAULT_STREAM = _LegacyDefaultStream() From b7f2cbb04289ca2d58fc947d870e7b6877f03615 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 27 Nov 2024 12:08:00 -0800 Subject: [PATCH 4/7] Buffer._MembersNeededForFinalize() approach. --- .../core/experimental/_kernel_arg_handler.pyx | 2 +- cuda_core/cuda/core/experimental/_memory.py | 61 +++++++++++-------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index 15846282e..698870314 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -182,7 +182,7 @@ cdef class ParamHolder: for i, arg in enumerate(kernel_args): if isinstance(arg, Buffer): # we need the address of where the actual buffer address is stored - self.data_addresses[i] = (arg._ptr.getPtr()) + self.data_addresses[i] = (arg.handle.getPtr()) continue elif isinstance(arg, int): # Here's the dilemma: We want to have a fast path to pass in Python diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 3bf0377e4..8cc8717ee 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,14 +42,28 @@ class Buffer: """ + class _MembersNeededForFinalize: + __slots__ = ("ptr", "size", "mr") + + def __init__(self, buffer_obj, ptr, size, mr): + self.ptr = ptr + self.size = size + self.mr = mr + weakref.finalize(buffer_obj, self.close) + + def close(self, stream=None): + if self.ptr and self.mr is not None: + if stream is None: + stream = default_stream() + self.mr.deallocate(self.ptr, self.size, stream) + self.ptr = 0 + self.mr = None + # TODO: handle ownership? (_mr could be None) - __slots__ = ("__weakref__", "_ptr", "_size", "_mr") + __slots__ = ("__weakref__", "_mnff") def __init__(self, ptr, size, mr: MemoryResource = None): - self._ptr = ptr - self._size = size - self._mr = mr - weakref.finalize(self, self.close) + self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr) def close(self, stream=None): """Deallocate this buffer asynchronously on the given stream. @@ -65,47 +79,42 @@ def close(self, stream=None): the default stream. """ - if self._ptr and self._mr is not None: - if stream is None: - stream = default_stream() - self._mr.deallocate(self._ptr, self._size, stream) - self._ptr = 0 - self._mr = None + self._mnff.close(stream) @property def handle(self): """Return the buffer handle object.""" - return self._ptr + return self._mnff.ptr @property def size(self): """Return the memory size of this buffer.""" - return self._size + return self._mnff.size @property def memory_resource(self) -> MemoryResource: """Return the memory resource associated with this buffer.""" - return self._mr + return self._mnff.mr @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible + if self._mnff.mr is not None: + return self._mnff.mr.is_device_accessible raise NotImplementedError @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible + if self._mnff.mr is not None: + return self._mnff.mr.is_host_accessible raise NotImplementedError @property def device_id(self) -> int: """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id + if self._mnff.mr is not None: + return self._mnff.mr.device_id raise NotImplementedError def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: @@ -127,12 +136,12 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: if stream is None: raise ValueError("stream must be provided") if dst is None: - if self._mr is None: + if self._mnff.mr is None: raise ValueError("a destination buffer must be provided") - dst = self._mr.allocate(self._size, stream) - if dst._size != self._size: + dst = self._mnff.mr.allocate(self._mnff.size, stream) + if dst._mnff.size != self._mnff.size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return(cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream.handle)) + handle_return(cuda.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle)) return dst def copy_from(self, src: Buffer, *, stream): @@ -149,9 +158,9 @@ def copy_from(self, src: Buffer, *, stream): """ if stream is None: raise ValueError("stream must be provided") - if src._size != self._size: + if src._mnff.size != self._mnff.size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return(cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream.handle)) + handle_return(cuda.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle)) def __dlpack__( self, From 26ddbf6d9685c06a1c27bf0187e45a33c18e8ef3 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 30 Nov 2024 11:28:33 -0800 Subject: [PATCH 5/7] Apply _MembersNeededForFinalize pattern to _event.py --- cuda_core/cuda/core/experimental/_event.py | 36 ++++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 7b0021113..f2dbf74ab 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -51,10 +51,21 @@ class Event: """ - __slots__ = ("__weakref__", "_finalizer", "_handle", "_timing_disabled", "_busy_waited") + class _MembersNeededForFinalize: + __slots__ = ("handle",) + + def __init__(self, event_obj, handle): + self.handle = handle + weakref.finalize(event_obj, self.close) + + def close(self): + if self.handle is not None: + handle_return(cuda.cuEventDestroy(self.handle)) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited") def __init__(self): - self._handle = None raise NotImplementedError( "directly creating an Event object can be ambiguous. Please call call Stream.record()." ) @@ -62,8 +73,7 @@ def __init__(self): @staticmethod def _init(options: Optional[EventOptions] = None): self = Event.__new__(Event) - # minimal requirements for the destructor - self._handle = None + self._mnff = Event._MembersNeededForFinalize(self, None) options = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 @@ -77,20 +87,12 @@ def _init(options: Optional[EventOptions] = None): self._busy_waited = True if options.support_ipc: raise NotImplementedError("TODO") - self._handle = handle_return(cuda.cuEventCreate(flags)) - self._finalizer = weakref.finalize(self, Event._finalize, self._handle) + self._mnff.handle = handle_return(cuda.cuEventCreate(flags)) return self - @staticmethod - def _finalize(self_handle): - handle_return(cuda.cuEventDestroy(self_handle)) - def close(self): """Destroy the event.""" - if self._handle: - self._finalizer.Detach() - Event._finalize(self._handle) - self._handle = None + self._mnff.close() @property def is_timing_disabled(self) -> bool: @@ -117,12 +119,12 @@ def sync(self): has been completed. """ - handle_return(cuda.cuEventSynchronize(self._handle)) + handle_return(cuda.cuEventSynchronize(self._mnff.handle)) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - (result,) = cuda.cuEventQuery(self._handle) + (result,) = cuda.cuEventQuery(self._mnff.handle) if result == cuda.CUresult.CUDA_SUCCESS: return True elif result == cuda.CUresult.CUDA_ERROR_NOT_READY: @@ -133,4 +135,4 @@ def is_done(self) -> bool: @property def handle(self) -> int: """Return the underlying cudaEvent_t pointer address as Python int.""" - return int(self._handle) + return int(self._mnff.handle) From 08aa6ecd1db66b227341ffe232c8c92cadc57fe5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 30 Nov 2024 11:36:47 -0800 Subject: [PATCH 6/7] _module.py: simply keep TODO comment only --- cuda_core/cuda/core/experimental/_module.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index f67fbfb70..69dbcd374 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import importlib.metadata -import weakref from cuda import cuda from cuda.core.experimental._utils import handle_return @@ -107,7 +106,7 @@ class ObjectCode: """ - __slots__ = ("__weakref__", "_handle", "_code_type", "_module", "_loader", "_sym_map") + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin") def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): @@ -115,7 +114,6 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): raise ValueError _lazy_init() self._handle = None - weakref.finalize(self, self.close) backend = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" self._loader = _backend[backend] @@ -150,9 +148,7 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): self._module = module self._sym_map = {} if symbol_mapping is None else symbol_mapping - def close(self): - # TODO: do we want to unload? Probably not.. - pass + # TODO: do we want to unload in a finalizer? Probably not.. def get_kernel(self, name): """Return the :obj:`Kernel` of a specified name from this object code. From b872767fa572b8653a0332d763d8f6a351623093 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 30 Nov 2024 11:45:20 -0800 Subject: [PATCH 7/7] Apply _MembersNeededForFinalize pattern to _program.py --- cuda_core/cuda/core/experimental/_program.py | 41 ++++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 22b9fd81c..28936a888 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -26,32 +26,41 @@ class Program: """ - __slots__ = ("__weakref__", "_handle", "_backend") + class _MembersNeededForFinalize: + __slots__ = ("handle",) + + def __init__(self, program_obj, handle): + self.handle = handle + weakref.finalize(program_obj, self.close) + + def close(self): + if self.handle is not None: + handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) + self.handle = None + + __slots__ = ("__weakref__", "_mnff", "_backend") _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") def __init__(self, code, code_type): + self._mnff = Program._MembersNeededForFinalize(self, None) + if code_type not in self._supported_code_type: raise NotImplementedError - self._handle = None - weakref.finalize(self, self.close) - if code_type.lower() == "c++": if not isinstance(code, str): raise TypeError # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - self._handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" else: raise NotImplementedError def close(self): """Destroy this program.""" - if self._handle is not None: - handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) - self._handle = None + self._mnff.close() def compile(self, target_type, options=(), name_expressions=(), logs=None): """Compile the program with a specific compilation type. @@ -84,29 +93,29 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) + handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved options = list(o.encode() for o in options) - handle_return(nvrtc.nvrtcCompileProgram(self._handle, len(options), options), handle=self._handle) + handle_return(nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), handle=self._mnff.handle) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") - size = handle_return(size_func(self._handle), handle=self._handle) + size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle) data = b" " * size - handle_return(comp_func(self._handle, data), handle=self._handle) + handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle) symbol_mapping = {} if name_expressions: for n in name_expressions: symbol_mapping[n] = handle_return( - nvrtc.nvrtcGetLoweredName(self._handle, n.encode()), handle=self._handle + nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle ) if logs is not None: - logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle), handle=self._handle) + logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle) if logsize > 1: log = b" " * logsize - handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log), handle=self._handle) + handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode()) # TODO: handle jit_options for ptx? @@ -121,4 +130,4 @@ def backend(self): @property def handle(self): """Return the program handle object.""" - return self._handle + return self._mnff.handle