Skip to content

WIP: Cythonize away some perf hot spots #709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,21 @@ class ContextOptions:
pass # TODO


class Context:
__slots__ = ("_handle", "_id")
cdef class Context:

def __new__(self, *args, **kwargs):
cdef:
object _handle
int _device_id

def __init__(self, *args, **kwargs):
raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.")

@classmethod
def _from_ctx(cls, obj, dev_id):
assert_type(obj, driver.CUcontext)
ctx = super().__new__(cls)
ctx._handle = obj
ctx._id = dev_id
def _from_ctx(cls, handle: driver.CUcontext, int device_id):
cdef Context ctx = Context.__new__(Context)
ctx._handle = handle
ctx._device_id = device_id
return ctx

def __eq__(self, other):
return int(self._handle) == int(other._handle)
6 changes: 4 additions & 2 deletions cuda_core/cuda/core/experimental/_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,6 @@ def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions
"""
return Stream._init(obj=obj, options=options)

@precondition(_check_context_initialized)
def create_event(self, options: Optional[EventOptions] = None) -> Event:
"""Create an Event object without recording it to a Stream.

Expand All @@ -1256,7 +1255,10 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event:
Newly created event object.

"""
return Event._init(self._id, self.context._handle, options)
ctx = driver.cuCtxGetCurrent()[1]
if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
return Event._init(self._id, ctx, options)

@precondition(_check_context_initialized)
def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@

from __future__ import annotations

import weakref
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._context import Context
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
check_or_create_options,
driver,
handle_return,
)
Expand All @@ -25,7 +23,7 @@


@dataclass
class EventOptions:
cdef class EventOptions:
"""Customizable :obj:`~_event.Event` options.

Attributes
Expand All @@ -49,7 +47,27 @@ class EventOptions:
support_ipc: Optional[bool] = False


class Event:
cdef inline EventOptions check_or_create_options(options, str options_description):
    """Normalize user-supplied event options into an ``EventOptions`` instance.

    Parameters
    ----------
    options : EventOptions, dict, or None
        ``None`` yields a default-constructed ``EventOptions``; a ``dict`` is
        expanded as keyword arguments to ``EventOptions``; an existing
        ``EventOptions`` instance is returned as-is.
    options_description : str
        Human-readable name of the options, used only in the error message.

    Returns
    -------
    EventOptions
        The options object to use.

    Raises
    ------
    TypeError
        If ``options`` is not ``None``, a ``dict``, or an ``EventOptions``.
    """
    cdef EventOptions opts
    if options is None:
        opts = EventOptions()
    elif isinstance(options, dict):
        opts = EventOptions(**options)
    elif isinstance(options, EventOptions):
        # Pass an already-built options object through unchanged; without this
        # branch ``opts`` would never be assigned for the most common input.
        opts = options
    else:
        raise TypeError(
            f"The {options_description} must be provided as an object "
            f"of type {EventOptions.__name__} or as a dict with valid {options_description}. "
            f"The provided object is '{options}'."
        )

    return opts



cdef class Event:
"""Represent a record at a specific point of execution within a CUDA stream.

Applications can asynchronously record events at any point in
Expand Down Expand Up @@ -77,30 +95,20 @@ class Event:
and they should instead be created through a :obj:`~_stream.Stream` object.

"""

class _MembersNeededForFinalize:
__slots__ = ("handle",)

def __init__(self, event_obj, handle):
self.handle = handle
weakref.finalize(event_obj, self.close)

def close(self):
if self.handle is not None:
handle_return(driver.cuEventDestroy(self.handle))
self.handle = None

def __new__(self, *args, **kwargs):
cdef:
object _handle
bint _timing_disabled
bint _busy_waited
int _device_id
object _ctx_handle

def __init__(self, *args, **kwargs):
raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")

__slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited", "_device_id", "_ctx_handle")

@classmethod
def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptions] = None):
self = super().__new__(cls)
self._mnff = Event._MembersNeededForFinalize(self, None)

options = check_or_create_options(EventOptions, options, "Event options")
def _init(cls, device_id: int, ctx_handle: Context, opts=None):
cdef Event self = Event.__new__(Event)
cdef EventOptions options = check_or_create_options(opts, "Event options")
flags = 0x0
self._timing_disabled = False
self._busy_waited = False
Expand All @@ -112,14 +120,22 @@ def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptio
self._busy_waited = True
if options.support_ipc:
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
self._mnff.handle = handle_return(driver.cuEventCreate(flags))
_, self._handle = driver.cuEventCreate(flags)
self._device_id = device_id
self._ctx_handle = ctx_handle
return self

cdef _close(self):
if self._handle is not None:
_ = driver.cuEventDestroy(self._handle)
self._handle = None

def close(self):
"""Destroy the event."""
self._mnff.close()
self._close()

def __dealloc__(self):
self._close()

def __isub__(self, other):
return NotImplemented
Expand All @@ -129,7 +145,7 @@ def __rsub__(self, other):

def __sub__(self, other):
# return self - other (in milliseconds)
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
err, timing = driver.cuEventElapsedTime(other.handle, self._handle)
try:
raise_if_driver_error(err)
return timing
Expand Down Expand Up @@ -180,12 +196,12 @@ def sync(self):
has been completed.

"""
handle_return(driver.cuEventSynchronize(self._mnff.handle))
handle_return(driver.cuEventSynchronize(self._handle))

@property
def is_done(self) -> bool:
"""Return True if all captured works have been completed, otherwise False."""
(result,) = driver.cuEventQuery(self._mnff.handle)
(result,) = driver.cuEventQuery(self._handle)
if result == driver.CUresult.CUDA_SUCCESS:
return True
if result == driver.CUresult.CUDA_ERROR_NOT_READY:
Expand All @@ -201,7 +217,7 @@ def handle(self) -> cuda.bindings.driver.CUevent:
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int(Event.handle)``.
"""
return self._mnff.handle
return self._handle

@property
def device(self) -> Device:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _reduce_3_tuple(t: tuple):
return t[0] * t[1] * t[2]


def _check_driver_error(error):
cpdef inline void _check_driver_error(error) except*:
if error == driver.CUresult.CUDA_SUCCESS:
return
name_err, name = driver.cuGetErrorName(error)
Expand All @@ -69,7 +69,7 @@ def _check_driver_error(error):
raise CUDAError(f"{name}: {desc}")


def _check_runtime_error(error):
cpdef inline void _check_runtime_error(error) except*:
if error == runtime.cudaError_t.cudaSuccess:
return
name_err, name = runtime.cudaGetErrorName(error)
Expand All @@ -86,7 +86,7 @@ def _check_runtime_error(error):
raise CUDAError(f"{name}: {desc}")


def _check_error(error, handle=None):
cdef inline void _check_error(error, handle=None) except*:
if isinstance(error, driver.CUresult):
_check_driver_error(error)
elif isinstance(error, runtime.cudaError_t):
Expand All @@ -105,7 +105,7 @@ def _check_error(error, handle=None):
raise RuntimeError(f"Unknown error type: {error}")


def handle_return(result, handle=None):
def handle_return(tuple result, handle=None):
_check_error(result[0], handle=handle)
if len(result) == 1:
return
Expand Down
28 changes: 14 additions & 14 deletions cuda_core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@
#
# SPDX-License-Identifier: Apache-2.0

import glob
import os

from Cython.Build import cythonize
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext as _build_ext

ext_modules = (
Extension(
"cuda.core.experimental._dlpack",
sources=["cuda/core/experimental/_dlpack.pyx"],
language="c++",
),
Extension(
"cuda.core.experimental._memoryview",
sources=["cuda/core/experimental/_memoryview.pyx"],
language="c++",
),

# It seems setuptools' wildcard support has problems for namespace packages,
# so we explicitly spell out all Extension instances.
root_module = "cuda.core.experimental"
root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
def strip_prefix_suffix(filename):
    """Return ``filename`` minus its first ``len(root_path)`` and last four characters."""
    prefix_len = len(root_path)
    # The trailing four characters are dropped unconditionally (the ".pyx"
    # extension for the files this helper is applied to).
    trimmed = filename[prefix_len:]
    return trimmed[:-4]
module_names = (strip_prefix_suffix(f) for f in ext_files)
ext_modules = tuple(
Extension(
"cuda.core.experimental._kernel_arg_handler",
sources=["cuda/core/experimental/_kernel_arg_handler.pyx"],
f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
sources=[f"cuda/core/experimental/{mod}.pyx"],
language="c++",
),
) for mod in module_names
)


Expand Down