From c6e0a02adeb13616f27eef6ad4729844bc55d37e Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Thu, 10 Oct 2024 13:39:46 -0700
Subject: [PATCH 01/34] naive approach to adding bindings

---
 cuda/cuda/bindings/_bindings/nvJitLink.pxd    |  26 ++
 .../bindings/_bindings/nvJitLink_linux.pyx    | 382 +++++++++++++++++
 .../bindings/_bindings/nvJitLink_windows.pyx  | 393 ++++++++++++++++++
 cuda/cuda/bindings/cynvJitLink.pxd            |  48 +++
 cuda/cuda/bindings/cynvJitLink.pyx            |  63 +++
 cuda/cuda/bindings/nvJitLink.pxd              |  46 ++
 cuda/cuda/bindings/nvJitLink.pyx              | 138 ++++++
 cuda/cuda/bindings/tests/test_nvJitLink.py    |   3 +
 cuda/setup.py                                 |   3 +-
 9 files changed, 1101 insertions(+), 1 deletion(-)
 create mode 100644 cuda/cuda/bindings/_bindings/nvJitLink.pxd
 create mode 100644 cuda/cuda/bindings/_bindings/nvJitLink_linux.pyx
 create mode 100644 cuda/cuda/bindings/_bindings/nvJitLink_windows.pyx
 create mode 100644 cuda/cuda/bindings/cynvJitLink.pxd
 create mode 100644 cuda/cuda/bindings/cynvJitLink.pyx
 create mode 100644 cuda/cuda/bindings/nvJitLink.pxd
 create mode 100644 cuda/cuda/bindings/nvJitLink.pyx
 create mode 100644 cuda/cuda/bindings/tests/test_nvJitLink.py

diff --git a/cuda/cuda/bindings/_bindings/nvJitLink.pxd b/cuda/cuda/bindings/_bindings/nvJitLink.pxd
new file mode 100644
index 000000000..dca128a0e
--- /dev/null
+++ b/cuda/cuda/bindings/_bindings/nvJitLink.pxd
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ..cynvJitLink cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+# Declarations only: nvJitLink_linux.pyx and nvJitLink_windows.pyx each define every function listed here.
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda/cuda/bindings/_bindings/nvJitLink_linux.pyx
new file mode 100644
index 000000000..2fc6ca625
--- /dev/null
+++ b/cuda/cuda/bindings/_bindings/nvJitLink_linux.pyx
@@ -0,0 +1,382 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:  # dynamic-loader API; every declaration here is callable without the GIL
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)  # the only call site in this module is commented out
+    # dlopen() mode flags; this module always passes RTLD_NOW | RTLD_GLOBAL.
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+    # Pseudo-handle for dlsym(): look the symbol up in the default search scope instead of one library.
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_nvJitLink_init = False  # set to True at the end of _check_or_init_nvJitLink()
+cdef void* __cuDriverGetVersion = NULL  # cached address of the driver's cuDriverGetVersion
+# Cached addresses of the nvJitLink entry points; an entry stays NULL when its symbol could not be resolved.
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:  # dlopen the first libnvJitLink.so candidate that loads
+    cdef void* handle
+    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):  # candidate suffixes come from .utils
+        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)  # a falsy suffix is appended as-is (no ".N" part)
+        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
+        if handle != NULL:
+            break  # first candidate that opens wins
+    else:  # runs only when the loop ended without a successful dlopen
+        err_msg = dlerror()  # NOTE(review): assumes a pending dlopen error; could be NULL if no suffix was tried -- confirm
+        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
+    return handle
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:  # lazy one-time resolution of all nvJitLink symbols; returns 0
+    global __py_nvJitLink_init
+    if __py_nvJitLink_init:  # already initialized by an earlier call
+        return 0
+
+    # Load driver to check version
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:  # the GIL is needed to build and raise the Python exception
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    global __cuDriverGetVersion
+    if __cuDriverGetVersion == NULL:  # look the driver symbol up only once
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:
+        with gil:
+            raise RuntimeError('something went wrong')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)  # driver_ver later selects the library suffixes
+    if err != 0:
+        with gil:
+            raise RuntimeError('something went wrong')
+    #dlclose(handle)
+    handle = NULL  # reset: from here on `handle` means the libnvJitLink handle (loaded on demand)
+
+    # Load function: try the default symbol scope first, else dlopen libnvJitLink once and use that handle
+    global __nvJitLinkCreate
+    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')
+    if __nvJitLinkCreate == NULL:
+        if handle == NULL:  # library not opened yet by an earlier stanza
+            handle = load_library(driver_ver)
+        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')  # may still be NULL; the wrapper checks before calling
+    
+    global __nvJitLinkDestroy  # every stanza below repeats the same lookup pattern
+    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
+    if __nvJitLinkDestroy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
+    
+    global __nvJitLinkAddData
+    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
+    if __nvJitLinkAddData == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
+    
+    global __nvJitLinkAddFile
+    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
+    if __nvJitLinkAddFile == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
+    
+    global __nvJitLinkComplete
+    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
+    if __nvJitLinkComplete == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
+    
+    global __nvJitLinkGetLinkedCubinSize
+    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
+    
+    global __nvJitLinkGetLinkedCubin
+    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
+    if __nvJitLinkGetLinkedCubin == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
+    
+    global __nvJitLinkGetLinkedPtxSize
+    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
+    
+    global __nvJitLinkGetLinkedPtx
+    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
+    if __nvJitLinkGetLinkedPtx == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
+    
+    global __nvJitLinkGetErrorLogSize
+    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
+    if __nvJitLinkGetErrorLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
+    
+    global __nvJitLinkGetErrorLog
+    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
+    if __nvJitLinkGetErrorLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
+    
+    global __nvJitLinkGetInfoLogSize
+    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
+    if __nvJitLinkGetInfoLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
+    
+    global __nvJitLinkGetInfoLog
+    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
+    if __nvJitLinkGetInfoLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
+
+    __py_nvJitLink_init = True  # subsequent calls take the early return above
+    return 0
+
+
+cdef dict func_ptrs = None  # cache populated by the first _inspect_function_pointers() call
+
+
+cpdef dict _inspect_function_pointers():  # returns {"__nvJitLink<Name>": address as int} for all wrapped symbols
+    global func_ptrs
+    if func_ptrs is not None:  # already built: hand back the cached dict
+        return func_ptrs
+
+    _check_or_init_nvJitLink()  # make sure the pointers have been resolved before reading them
+    cdef dict data = {}
+
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate  # 0 means the symbol was not found
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # publish the cache only once it is fully built
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    # Look up one resolved symbol address by its "__nvJitLink..." key.
+    # _inspect_function_pointers() returns the module-level cache and builds
+    # it on the first call, so no separate None check is needed here.
+    return _inspect_function_pointers()[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Forward to the lazily resolved nvJitLinkCreate symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+            handle, numOptions, options)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Forward to the lazily resolved nvJitLinkDestroy symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Forward to the lazily resolved nvJitLinkAddData symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+            handle, inputType, data, size, name)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Forward to the lazily resolved nvJitLinkAddFile symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+            handle, inputType, fileName)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Forward to the lazily resolved nvJitLinkComplete symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedCubinSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedCubin symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+            handle, cubin)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedPtxSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedPtx symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+            handle, ptx)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetErrorLogSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetErrorLog symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetInfoLogSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetInfoLog symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda/cuda/bindings/_bindings/nvJitLink_windows.pyx
new file mode 100644
index 000000000..8856b59ca
--- /dev/null
+++ b/cuda/cuda/bindings/_bindings/nvJitLink_windows.pyx
@@ -0,0 +1,393 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+import os
+import site
+
+import win32api
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800  # LoadLibraryEx flag used when loading nvcuda.dll
+LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000  # LoadLibraryEx flag used by load_library()
+LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100  # LoadLibraryEx flag used by load_library(); needs an absolute path
+cdef bint __py_nvJitLink_init = False  # set to True at the end of _check_or_init_nvJitLink()
+cdef void* __cuDriverGetVersion = NULL  # cached address of the driver's cuDriverGetVersion
+# Cached addresses of the nvJitLink entry points; an entry stays NULL when its lookup failed.
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef inline list get_site_packages():
+    return [site.getusersitepackages(), *site.getsitepackages()]  # user site dir first, then the global site dirs
+
+
+cdef load_library(const int driver_ver):
+    """Return a module handle for the nvJitLink DLL chosen from ``driver_ver``.
+
+    For each version suffix, try an already-loaded module, then a pip-installed
+    copy in every site-packages directory, then the default DLL search.
+    """
+    handle = 0
+    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
+        if len(suffix) == 0:
+            continue
+        dll_name = f"nvJitLink64_{suffix}.dll"
+        # First check if the DLL has been loaded by 3rd parties
+        try:
+            handle = win32api.GetModuleHandle(dll_name)
+        except Exception:
+            pass
+        else:
+            break
+        # Next, try every site-packages directory that holds a pip-installed DLL
+        for sp in get_site_packages():
+            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
+            if not os.path.isdir(mod_path):
+                continue
+            os.add_dll_directory(mod_path)
+            try:
+                handle = win32api.LoadLibraryEx(
+                    # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
+                    os.path.join(mod_path, dll_name),
+                    0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
+            except Exception:
+                continue  # not loadable from this directory; try the next one
+            break  # loaded from this directory
+        else:
+            # Finally, try default search
+            try:
+                handle = win32api.LoadLibrary(dll_name)
+            except Exception:
+                continue  # nothing worked for this suffix; move on to the next one
+        break  # reached only after a successful load
+    else:
+        raise RuntimeError('Failed to load nvJitLink')  # no suffix produced a loadable DLL
+
+    assert handle != 0
+    return handle
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:  # lazy one-time resolution of all nvJitLink symbols; returns 0
+    global __py_nvJitLink_init
+    if __py_nvJitLink_init:  # already initialized by an earlier call
+        return 0
+
+    cdef int err, driver_ver
+    with gil:  # everything below goes through Python-level win32api calls
+        # Load driver to check version
+        try:
+            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
+        except Exception as e:
+            raise NotSupportedError(f'CUDA driver is not found ({e})')
+        global __cuDriverGetVersion
+        if __cuDriverGetVersion == NULL:  # look the driver symbol up only once
+            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
+            if __cuDriverGetVersion == NULL:
+                raise RuntimeError('something went wrong')
+        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)  # driver_ver selects the DLL suffixes
+        if err != 0:
+            raise RuntimeError('something went wrong')
+
+        # Load library
+        handle = load_library(driver_ver)  # `handle` now refers to the nvJitLink DLL, not nvcuda.dll
+
+        # Load function
+        global __nvJitLinkCreate
+        try:
+            __nvJitLinkCreate = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkCreate')
+        except:  # best effort: a failed lookup leaves the pointer NULL and the wrapper raises later
+            pass  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
+    
+        global __nvJitLinkDestroy  # every stanza below repeats the same lookup pattern
+        try:
+            __nvJitLinkDestroy = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkDestroy')
+        except:
+            pass
+    
+        global __nvJitLinkAddData
+        try:
+            __nvJitLinkAddData = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddData')
+        except:
+            pass
+    
+        global __nvJitLinkAddFile
+        try:
+            __nvJitLinkAddFile = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddFile')
+        except:
+            pass
+    
+        global __nvJitLinkComplete
+        try:
+            __nvJitLinkComplete = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkComplete')
+        except:
+            pass
+    
+        global __nvJitLinkGetLinkedCubinSize
+        try:
+            __nvJitLinkGetLinkedCubinSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
+        except:
+            pass
+    
+        global __nvJitLinkGetLinkedCubin
+        try:
+            __nvJitLinkGetLinkedCubin = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
+        except:
+            pass
+    
+        global __nvJitLinkGetLinkedPtxSize
+        try:
+            __nvJitLinkGetLinkedPtxSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
+        except:
+            pass
+    
+        global __nvJitLinkGetLinkedPtx
+        try:
+            __nvJitLinkGetLinkedPtx = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
+        except:
+            pass
+    
+        global __nvJitLinkGetErrorLogSize
+        try:
+            __nvJitLinkGetErrorLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
+        except:
+            pass
+    
+        global __nvJitLinkGetErrorLog
+        try:
+            __nvJitLinkGetErrorLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLog')
+        except:
+            pass
+    
+        global __nvJitLinkGetInfoLogSize
+        try:
+            __nvJitLinkGetInfoLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
+        except:
+            pass
+    
+        global __nvJitLinkGetInfoLog
+        try:
+            __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
+        except:
+            pass
+
+    __py_nvJitLink_init = True  # subsequent calls take the early return above
+    return 0
+
+
+cdef dict func_ptrs = None  # cache populated by the first _inspect_function_pointers() call
+
+
+cpdef dict _inspect_function_pointers():  # returns {"__nvJitLink<Name>": address as int} for all wrapped symbols
+    global func_ptrs
+    if func_ptrs is not None:  # already built: hand back the cached dict
+        return func_ptrs
+
+    _check_or_init_nvJitLink()  # make sure the pointers have been resolved before reading them
+    cdef dict data = {}
+
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate  # 0 means the symbol was not found
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # publish the cache only once it is fully built
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    # Look up one resolved symbol address by its "__nvJitLink..." key.
+    # _inspect_function_pointers() returns the module-level cache and builds
+    # it on the first call, so no separate None check is needed here.
+    return _inspect_function_pointers()[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Forward to the lazily resolved nvJitLinkCreate symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+            handle, numOptions, options)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Forward to the lazily resolved nvJitLinkDestroy symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Forward to the lazily resolved nvJitLinkAddData symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+            handle, inputType, data, size, name)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Forward to the lazily resolved nvJitLinkAddFile symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+            handle, inputType, fileName)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Forward to the lazily resolved nvJitLinkComplete symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedCubinSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedCubin symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+            handle, cubin)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedPtxSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetLinkedPtx symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+            handle, ptx)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetErrorLogSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetErrorLog symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetInfoLogSize symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forward to the lazily resolved nvJitLinkGetInfoLog symbol; raise if it was never found.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda/cuda/bindings/cynvJitLink.pxd b/cuda/cuda/bindings/cynvJitLink.pxd
new file mode 100644
index 000000000..ed440c0b3
--- /dev/null
+++ b/cuda/cuda/bindings/cynvJitLink.pxd
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+
+from libc.stdint cimport int64_t
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums
+
+
+
+# types
+cdef extern from *:
+    """
+    #include <driver_types.h>
+    #include <library_types.h>
+    #include <cuComplex.h>
+    """
+    ctypedef void* cudaStream_t 'cudaStream_t'
+
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda/cuda/bindings/cynvJitLink.pyx b/cuda/cuda/bindings/cynvJitLink.pyx
new file mode 100644
index 000000000..65d3f9840
--- /dev/null
+++ b/cuda/cuda/bindings/cynvJitLink.pyx
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ._internal cimport nvJitLink as _nvJitLink
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    return _nvJitLink._nvJitLinkCreate(handle, numOptions, options)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    return _nvJitLink._nvJitLinkDestroy(handle)  # pass-through: argument and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    return _nvJitLink._nvJitLinkAddData(handle, inputType, data, size, name)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    return _nvJitLink._nvJitLinkAddFile(handle, inputType, fileName)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    return _nvJitLink._nvJitLinkComplete(handle)  # pass-through: argument and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedCubinSize(handle, size)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedCubin(handle, cubin)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedPtxSize(handle, size)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedPtx(handle, ptx)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetErrorLogSize(handle, size)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    return _nvJitLink._nvJitLinkGetErrorLog(handle, log)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetInfoLogSize(handle, size)  # pass-through: arguments and result are forwarded unchanged
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    return _nvJitLink._nvJitLinkGetInfoLog(handle, log)  # pass-through: arguments and result are forwarded unchanged
diff --git a/cuda/cuda/bindings/nvJitLink.pxd b/cuda/cuda/bindings/nvJitLink.pxd
new file mode 100644
index 000000000..d063002be
--- /dev/null
+++ b/cuda/cuda/bindings/nvJitLink.pxd
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .cynvJitLink cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+
+
+ctypedef cudaStream_t Stream
+ctypedef cudaDataType DataType
+ctypedef libraryPropertyType_t LibraryPropertyType
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
+cpdef destroy(intptr_t handle)
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name)
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name)
+cpdef complete(nvJitLinkHandle handle)
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin)
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx)
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log)
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log)
diff --git a/cuda/cuda/bindings/nvJitLink.pyx b/cuda/cuda/bindings/nvJitLink.pyx
new file mode 100644
index 000000000..18f4c7545
--- /dev/null
+++ b/cuda/cuda/bindings/nvJitLink.pyx
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from enum import IntEnum as _IntEnum
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+cdef dict STATUS={  # status code -> symbolic name; nvJitLinkError uses it to build its message
+    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
+    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
+    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH',  # -arch=sm_NN option not specified
+    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
+    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
+    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
+    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL',
+    NVJITLINK_ERROR_THREADPOOL          : 'NVJITLINK_ERROR_THREADPOOL',
+    NVJITLINK_ERROR_UNRECOGNIZED_INPUT  : 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
+    NVJITLINK_ERROR_NULL_INPUT          : 'NVJITLINK_ERROR_NULL_INPUT',
+    NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS: 'NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS',
+    NVJITLINK_ERROR_INCORRECT_INPUT_TYPE: 'NVJITLINK_ERROR_INCORRECT_INPUT_TYPE',
+    NVJITLINK_ERROR_ARCH_MISMATCH       : 'NVJITLINK_ERROR_ARCH_MISMATCH',
+    NVJITLINK_ERROR_OUTDATED_LIBRARY    : 'NVJITLINK_ERROR_OUTDATED_LIBRARY',
+    NVJITLINK_ERROR_MISSING_FATBIN      : 'NVJITLINK_ERROR_MISSING_FATBIN'
+}
+
+class nvJitLinkError(Exception):  # keeps the raw status code in .status; the message is its symbolic name
+
+    def __init__(self, status):
+        self.status = status
+        cdef str err = STATUS.get(status, f'unknown nvJitLink status ({status})')  # a code missing from STATUS must not surface as KeyError
+        super(nvJitLinkError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))  # pickling rebuilds the exception from the status code
+
+
+@cython.profile(False)
+cdef inline void check_status(int status) except* nogil:  # except*: explicitly propagate the raised error to the caller
+    if status != 0:  # any non-zero status is reported as an error
+        with gil:  # raising a Python exception needs the GIL
+            raise nvJitLinkError(status)
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):  # handle and options are integer addresses, cast to pointers below
+    with nogil:
+        status = nvJitLinkCreate(<nvJitLinkHandle*>handle, num_options, <const char**>options)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef destroy(intptr_t handle):  # handle is the integer address of an nvJitLinkHandle, cast below
+    with nogil:
+        status = nvJitLinkDestroy(<nvJitLinkHandle*>handle)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name):  # data and name are integer addresses, cast below
+    with nogil:
+        status = nvJitLinkAddData(handle, input_type, <const void*>data, size, <const char*>name)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name):  # file_name is the integer address of a C string, cast below
+    with nogil:
+        status = nvJitLinkAddFile(handle, input_type, <const char*>file_name)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef complete(nvJitLinkHandle handle):  # calls nvJitLinkComplete and turns a non-zero status into an exception
+    with nogil:
+        status = nvJitLinkComplete(handle)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size):  # size is the integer address of a size_t, cast below
+    with nogil:
+        status = nvJitLinkGetLinkedCubinSize(handle, <size_t*>size)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin):  # cubin is the integer address of the output buffer, cast below
+    with nogil:
+        status = nvJitLinkGetLinkedCubin(handle, <void*>cubin)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size):  # size is the integer address of a size_t, cast below
+    with nogil:
+        status = nvJitLinkGetLinkedPtxSize(handle, <size_t*>size)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx):  # ptx is the integer address of the output buffer, cast below
+    with nogil:
+        status = nvJitLinkGetLinkedPtx(handle, <char*>ptx)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size):  # size is the integer address of a size_t, cast below
+    with nogil:
+        status = nvJitLinkGetErrorLogSize(handle, <size_t*>size)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log):  # log is the integer address of the output buffer, cast below
+    with nogil:
+        status = nvJitLinkGetErrorLog(handle, <char*>log)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size):  # size is the integer address of a size_t, cast below
+    with nogil:
+        status = nvJitLinkGetInfoLogSize(handle, <size_t*>size)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
+
+
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log):  # log is the integer address of the output buffer, cast below
+    with nogil:
+        status = nvJitLinkGetInfoLog(handle, <char*>log)
+        check_status(status)  # was _check_status, which is not defined; check_status raises nvJitLinkError on non-zero
diff --git a/cuda/cuda/bindings/tests/test_nvJitLink.py b/cuda/cuda/bindings/tests/test_nvJitLink.py
new file mode 100644
index 000000000..7ced5ff38
--- /dev/null
+++ b/cuda/cuda/bindings/tests/test_nvJitLink.py
@@ -0,0 +1,3 @@
+import pytest
+from cuda import nvJitLink
+
diff --git a/cuda/setup.py b/cuda/setup.py
index ec5236261..8987151a8 100644
--- a/cuda/setup.py
+++ b/cuda/setup.py
@@ -57,7 +57,8 @@
                  'cuda_egl_interop.h',
                  'cuda_gl_interop.h',
                  'cuda_vdpau_interop.h'],
-    'nvrtc' : ['nvrtc.h']}
+    'nvrtc' : ['nvrtc.h'],
+    'nvJitLink' : ['nvJitLink.h'],}
 
 replace = {' __device_builtin__ ':' ',
            'CUDARTAPI ':' ',

From 47db0c7b35f43ee2a60a2a8b7307afdadecf9f51 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 15 Oct 2024 10:11:07 -0700
Subject: [PATCH 02/34]  add test file

---
 cuda/cuda/bindings/tests/test_nvJitLink.py | 161 +++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/cuda/cuda/bindings/tests/test_nvJitLink.py b/cuda/cuda/bindings/tests/test_nvJitLink.py
index 7ced5ff38..f566ae7c6 100644
--- a/cuda/cuda/bindings/tests/test_nvJitLink.py
+++ b/cuda/cuda/bindings/tests/test_nvJitLink.py
@@ -1,3 +1,164 @@
 import pytest
 from cuda import nvJitLink
 
+def test_create_no_arch_error():
+    # nvjitlink expects at least the architecture to be specified.
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
+        nvJitLink.create()  # called with no options at all
+
+
+def test_invalid_arch_error():
+    # sm_XX is not a valid architecture
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
+        nvJitLink.create("-arch=sm_XX")  # expected to be reported as an unrecognized option
+
+
+def test_unrecognized_option_error():
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
+        nvJitLink.create("-fictitious_option")  # a made-up flag, passed without any -arch option
+
+
+def test_invalid_option_type_error():
+    with pytest.raises(TypeError, match="Expecting only strings"):
+        nvJitLink.create("-arch", 53)  # 53 is an int, so a TypeError is expected
+
+
+def test_create_and_destroy():
+    handle = nvJitLink.create("-arch=sm_53")
+    assert handle != 0  # a zero handle is treated as a failed create
+    nvJitLink.destroy(handle)
+
+
+def test_complete_empty():
+    handle = nvJitLink.create("-arch=sm_75")
+    nvJitLink.complete(handle)  # completes a link to which no inputs were added
+    nvJitLink.destroy(handle)
+
+
+@pytest.mark.parametrize(
+    "input_file,input_type",
+    [
+        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
+        ("device_functions_fatbin", nvJitLink.InputType.FATBIN),
+        ("device_functions_ptx", nvJitLink.InputType.PTX),
+        ("device_functions_object", nvJitLink.InputType.OBJECT),
+        ("device_functions_archive", nvJitLink.InputType.LIBRARY),
+    ],
+)
+def test_add_file(input_file, input_type, gpu_arch_flag, request):
+    filename, data = request.getfixturevalue(input_file)  # input_file names a fixture yielding (filename, data)
+
+    handle = nvJitLink.create(gpu_arch_flag)
+    nvJitLink.add_data(handle, input_type.value, data, filename)
+    nvJitLink.destroy(handle)
+
+
+# We test the LTO input case separately as it requires the `-lto` flag. The
+# OBJECT input type is used because the LTO-IR container is packaged in an ELF
+# object when produced by NVCC.
+def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object
+
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, nvJitLink.InputType.OBJECT.value, data, filename)  # qualified: bare InputType is never imported here
+    nvJitLink.destroy(handle)
+
+
+def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = undefined_extern_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # qualified: bare InputType is never imported here
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError):
+        nvJitLink.complete(handle)
+    error_log = nvJitLink.get_error_log(handle)  # read the log before the handle is destroyed
+    nvJitLink.destroy(handle)
+    assert (
+        "Undefined reference to '_Z5undefff' "
+        "in 'undefined_extern.cubin'" in error_log
+    )
+
+
+def test_get_info_log(device_functions_cubin, gpu_arch_flag):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # qualified: bare InputType is never imported here
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    info_log = nvJitLink.get_info_log(handle)
+    nvJitLink.destroy(handle)
+    # Info log is empty
+    assert "" == info_log
+
+
+def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # qualified: bare InputType is never imported here
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_cubin_link_not_complete_error(
+    device_functions_cubin, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # qualified: bare InputType is never imported here
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
+        nvJitLink.get_linked_cubin(handle)  # complete() was deliberately not called first
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # qualified: bare InputType is never imported here
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # qualified: bare InputType is never imported here
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    nvJitLink.get_linked_ptx(handle)  # result is not inspected; the call only has to succeed
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_ptx_link_not_complete_error(
+    device_functions_ltoir_object, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    filename, data = device_functions_ltoir_object
+    input_type = nvJitLink.InputType.OBJECT.value  # qualified: bare InputType is never imported here
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
+        nvJitLink.get_linked_ptx(handle)  # complete() was deliberately not called first
+    nvJitLink.destroy(handle)
+
+
+def test_package_version():  # NOTE(review): pynvjitlink is not imported in this module (only pytest and nvJitLink are) — confirm the intended package
+    assert pynvjitlink.__version__ is not None
+    assert len(str(pynvjitlink.__version__)) > 0  # the version must render as a non-empty string
\ No newline at end of file

From 84efbb02efdcb73bc327a005748fcc799332b2aa Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 15 Oct 2024 12:47:51 -0700
Subject: [PATCH 03/34] rebase

---
 .../cuda/bindings/_bindings/nvJitLink.pxd     |  26 ++
 .../bindings/_bindings/nvJitLink_linux.pyx    | 382 +++++++++++++++++
 .../bindings/_bindings/nvJitLink_windows.pyx  | 393 ++++++++++++++++++
 cuda_bindings/cynvJitLink.pxd                 |  48 +++
 cuda_bindings/cynvJitLink.pyx                 |  63 +++
 cuda_bindings/nvJitLink.pxd                   |  46 ++
 cuda_bindings/nvJitLink.pyx                   | 138 ++++++
 cuda_bindings/setup.py                        |   3 +-
 cuda_bindings/tests/test_nvJitLink.py         |   3 +
 9 files changed, 1101 insertions(+), 1 deletion(-)
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
 create mode 100644 cuda_bindings/cynvJitLink.pxd
 create mode 100644 cuda_bindings/cynvJitLink.pyx
 create mode 100644 cuda_bindings/nvJitLink.pxd
 create mode 100644 cuda_bindings/nvJitLink.pyx
 create mode 100644 cuda_bindings/tests/test_nvJitLink.py

diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
new file mode 100644
index 000000000..dca128a0e
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ..cynvJitLink cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
new file mode 100644
index 000000000..2fc6ca625
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
@@ -0,0 +1,382 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_nvJitLink_init = False
+cdef void* __cuDriverGetVersion = NULL
+
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:  # returns the dlopen handle of the first candidate name that loads
+    cdef void* handle
+    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):  # candidate suffixes come from the helper, tried in the order it yields them
+        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)  # a falsy suffix is appended as-is (empty string -> plain libnvJitLink.so)
+        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
+        if handle != NULL:
+            break
+    else:  # loop finished without break: no candidate could be opened
+        err_msg = dlerror()
+        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
+    return handle
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:  # one-time resolution of every nvJitLink entry point into the module-level pointers
+    global __py_nvJitLink_init
+    if __py_nvJitLink_init:  # already initialised: nothing to do
+        return 0
+
+    # Load driver to check version
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:  # this function is nogil, so take the GIL before raising
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    global __cuDriverGetVersion
+    if __cuDriverGetVersion == NULL:
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:  # the driver library does not export cuDriverGetVersion
+        with gil:
+            raise RuntimeError('something went wrong')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+    if err != 0:  # cuDriverGetVersion reported a failure
+        with gil:
+            raise RuntimeError('something went wrong')
+    #dlclose(handle)
+    handle = NULL  # the driver handle is not closed; the variable is reused below for libnvJitLink
+
+    # Load function
+    global __nvJitLinkCreate
+    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')  # first try symbols already visible to the process
+    if __nvJitLinkCreate == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)  # dlopen libnvJitLink only once, the first time a default lookup fails
+        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')  # may still be NULL; the wrapper functions check for that
+    
+    global __nvJitLinkDestroy
+    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
+    if __nvJitLinkDestroy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
+    
+    global __nvJitLinkAddData
+    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
+    if __nvJitLinkAddData == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
+    
+    global __nvJitLinkAddFile
+    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
+    if __nvJitLinkAddFile == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
+    
+    global __nvJitLinkComplete
+    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
+    if __nvJitLinkComplete == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
+    
+    global __nvJitLinkGetLinkedCubinSize
+    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
+    
+    global __nvJitLinkGetLinkedCubin
+    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
+    if __nvJitLinkGetLinkedCubin == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
+    
+    global __nvJitLinkGetLinkedPtxSize
+    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
+    
+    global __nvJitLinkGetLinkedPtx
+    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
+    if __nvJitLinkGetLinkedPtx == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
+    
+    global __nvJitLinkGetErrorLogSize
+    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
+    if __nvJitLinkGetErrorLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
+    
+    global __nvJitLinkGetErrorLog
+    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
+    if __nvJitLinkGetErrorLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
+    
+    global __nvJitLinkGetInfoLogSize
+    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
+    if __nvJitLinkGetInfoLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
+    
+    global __nvJitLinkGetInfoLog
+    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
+    if __nvJitLinkGetInfoLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
+
+    __py_nvJitLink_init = True  # set only after every lookup ran, so a raised error leaves the flag False
+    return 0
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():  # returns {global pointer name: address as an integer}; built once and cached in func_ptrs
+    global func_ptrs
+    if func_ptrs is not None:  # cached result from an earlier call
+        return func_ptrs
+
+    _check_or_init_nvJitLink()  # make sure the pointers have been resolved before reading them
+    cdef dict data = {}
+
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate  # an unresolved symbol shows up as 0
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # cache; later calls return this same dict object
+    return data
+
+
+cpdef _inspect_function_pointer(str name):  # address of a single entry; name is a key such as "__nvJitLinkCreate"
+    global func_ptrs
+    if func_ptrs is None:  # table not built yet
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]  # an unknown name raises KeyError
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Forwards the call to the nvJitLinkCreate entry point stored in __nvJitLinkCreate.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkCreate
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+        handle, numOptions, options)
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Forwards the call to the nvJitLinkDestroy entry point stored in __nvJitLinkDestroy.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkDestroy
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Forwards the call to the nvJitLinkAddData entry point stored in __nvJitLinkAddData.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkAddData
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+        handle, inputType, data, size, name)
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Forwards the call to the nvJitLinkAddFile entry point stored in __nvJitLinkAddFile.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkAddFile
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+        handle, inputType, fileName)
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Forwards the call to the nvJitLinkComplete entry point stored in __nvJitLinkComplete.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkComplete
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedCubinSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedCubinSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedCubin.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedCubin
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+        handle, cubin)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedPtxSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedPtxSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedPtx.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedPtx
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+        handle, ptx)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetErrorLogSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetErrorLogSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetErrorLog.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetErrorLog
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+        handle, log)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetInfoLogSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetInfoLogSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetInfoLog.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetInfoLog
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
new file mode 100644
index 000000000..8856b59ca
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
@@ -0,0 +1,393 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+import os
+import site
+
+import win32api
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+# Flag values passed to win32api.LoadLibraryEx by the loaders below.
+LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800
+LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+# Set to True once _check_or_init_nvJitLink() has completed.
+cdef bint __py_nvJitLink_init = False
+# Address of cuDriverGetVersion looked up in nvcuda.dll during initialization.
+cdef void* __cuDriverGetVersion = NULL
+
+# Addresses of the nvJitLink entry points; each stays NULL when its lookup fails.
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef inline list get_site_packages():
+    # Candidate site-packages directories: the per-user one first, then the global ones.
+    search_dirs = [site.getusersitepackages()]
+    search_dirs.extend(site.getsitepackages())
+    return search_dirs
+
+
+cdef load_library(const int driver_ver):
+    # Locate and load the nvJitLink DLL and return its module handle.
+    #
+    # For every non-empty version suffix produced by
+    # get_nvJitLink_dso_version_suffix(driver_ver) the lookup order is:
+    #   1. a copy already loaded into the process,
+    #   2. "nvidia/nvJitLink/bin" under each site-packages directory,
+    #   3. the default DLL search path.
+    #
+    # Raises RuntimeError when no candidate could be loaded.
+    handle = 0
+
+    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
+        if len(suffix) == 0:
+            continue
+        dll_name = f"nvJitLink64_{suffix}.dll"
+
+        # First check if the DLL has been loaded by 3rd parties
+        try:
+            handle = win32api.GetModuleHandle(dll_name)
+        except Exception:
+            pass
+        else:
+            break
+
+        # Next, check if DLLs are installed via pip. Register every existing
+        # candidate directory up front so dependent DLLs can be resolved.
+        mod_paths = []
+        for sp in get_site_packages():
+            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
+            if not os.path.isdir(mod_path):
+                continue
+            os.add_dll_directory(mod_path)
+            mod_paths.append(mod_path)
+
+        # Try each registered directory in turn; using only the loop's final
+        # path would skip valid installs in earlier site-packages entries.
+        loaded = False
+        for mod_path in mod_paths:
+            try:
+                handle = win32api.LoadLibraryEx(
+                    # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
+                    os.path.join(mod_path, dll_name),
+                    0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
+            except Exception:
+                continue
+            loaded = True
+            break
+        if loaded:
+            break
+
+        # Finally, try default search
+        try:
+            handle = win32api.LoadLibrary(dll_name)
+        except Exception:
+            pass
+        else:
+            break
+    else:
+        raise RuntimeError('Failed to load nvJitLink')
+
+    # Explicit check rather than ``assert`` so it survives optimized builds.
+    if handle == 0:
+        raise RuntimeError('Failed to load nvJitLink')
+    return handle
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:
+    # One-time initialization of the module-level function pointers.
+    #
+    # Loads nvcuda.dll to query the driver version, loads the matching
+    # nvJitLink DLL through load_library(), then looks up every nvJitLink
+    # entry point. A symbol that cannot be found keeps its NULL pointer; the
+    # corresponding wrapper reports it with FunctionNotFoundError.
+    #
+    # Returns 0 on success. Raises NotSupportedError when the driver DLL
+    # cannot be loaded and RuntimeError when the driver version query fails.
+    global __py_nvJitLink_init
+    if __py_nvJitLink_init:
+        return 0
+
+    cdef int err, driver_ver
+    with gil:
+        # Load driver to check version
+        try:
+            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
+        except Exception as e:
+            raise NotSupportedError(f'CUDA driver is not found ({e})')
+        global __cuDriverGetVersion
+        if __cuDriverGetVersion == NULL:
+            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
+            if __cuDriverGetVersion == NULL:
+                raise RuntimeError('cuDriverGetVersion was not found in nvcuda.dll')
+        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+        if err != 0:
+            raise RuntimeError(f'cuDriverGetVersion failed with error code {err}')
+
+        # Load library
+        handle = load_library(driver_ver)
+
+        # Load function. Lookup failures are tolerated on purpose (the pointer
+        # stays NULL), but only ordinary exceptions are swallowed so that
+        # KeyboardInterrupt/SystemExit still propagate.
+        global __nvJitLinkCreate
+        try:
+            __nvJitLinkCreate = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkCreate')
+        except Exception:
+            pass
+
+        global __nvJitLinkDestroy
+        try:
+            __nvJitLinkDestroy = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkDestroy')
+        except Exception:
+            pass
+
+        global __nvJitLinkAddData
+        try:
+            __nvJitLinkAddData = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddData')
+        except Exception:
+            pass
+
+        global __nvJitLinkAddFile
+        try:
+            __nvJitLinkAddFile = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddFile')
+        except Exception:
+            pass
+
+        global __nvJitLinkComplete
+        try:
+            __nvJitLinkComplete = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkComplete')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedCubinSize
+        try:
+            __nvJitLinkGetLinkedCubinSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedCubin
+        try:
+            __nvJitLinkGetLinkedCubin = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedPtxSize
+        try:
+            __nvJitLinkGetLinkedPtxSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedPtx
+        try:
+            __nvJitLinkGetLinkedPtx = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetErrorLogSize
+        try:
+            __nvJitLinkGetErrorLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetErrorLog
+        try:
+            __nvJitLinkGetErrorLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLog')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetInfoLogSize
+        try:
+            __nvJitLinkGetInfoLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetInfoLog
+        try:
+            __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
+        except Exception:
+            pass
+
+    __py_nvJitLink_init = True
+    return 0
+
+
+# Cache filled by _inspect_function_pointers(); stays None until that function has run successfully.
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    """Return a dict mapping each ``__nvJitLink*`` pointer name to its address.
+
+    Addresses are stored as ``intptr_t`` integers. The dict is built once,
+    after ``_check_or_init_nvJitLink()`` has run, and cached in the
+    module-level ``func_ptrs``; later calls return that same object.
+    """
+    global func_ptrs
+    if func_ptrs is None:
+        _check_or_init_nvJitLink()
+        func_ptrs = {
+            "__nvJitLinkCreate": <intptr_t>__nvJitLinkCreate,
+            "__nvJitLinkDestroy": <intptr_t>__nvJitLinkDestroy,
+            "__nvJitLinkAddData": <intptr_t>__nvJitLinkAddData,
+            "__nvJitLinkAddFile": <intptr_t>__nvJitLinkAddFile,
+            "__nvJitLinkComplete": <intptr_t>__nvJitLinkComplete,
+            "__nvJitLinkGetLinkedCubinSize": <intptr_t>__nvJitLinkGetLinkedCubinSize,
+            "__nvJitLinkGetLinkedCubin": <intptr_t>__nvJitLinkGetLinkedCubin,
+            "__nvJitLinkGetLinkedPtxSize": <intptr_t>__nvJitLinkGetLinkedPtxSize,
+            "__nvJitLinkGetLinkedPtx": <intptr_t>__nvJitLinkGetLinkedPtx,
+            "__nvJitLinkGetErrorLogSize": <intptr_t>__nvJitLinkGetErrorLogSize,
+            "__nvJitLinkGetErrorLog": <intptr_t>__nvJitLinkGetErrorLog,
+            "__nvJitLinkGetInfoLogSize": <intptr_t>__nvJitLinkGetInfoLogSize,
+            "__nvJitLinkGetInfoLog": <intptr_t>__nvJitLinkGetInfoLog,
+        }
+    return func_ptrs
+
+
+cpdef _inspect_function_pointer(str name):
+    """Return the cached address stored under ``name`` (e.g. ``"__nvJitLinkCreate"``).
+
+    The lookup goes through ``_inspect_function_pointers()``, which builds and
+    caches the table on first use; an unknown ``name`` raises ``KeyError``.
+    """
+    return _inspect_function_pointers()[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Forwards the call to the nvJitLinkCreate entry point stored in __nvJitLinkCreate.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkCreate
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+        handle, numOptions, options)
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Forwards the call to the nvJitLinkDestroy entry point stored in __nvJitLinkDestroy.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkDestroy
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Forwards the call to the nvJitLinkAddData entry point stored in __nvJitLinkAddData.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkAddData
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+        handle, inputType, data, size, name)
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Forwards the call to the nvJitLinkAddFile entry point stored in __nvJitLinkAddFile.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkAddFile
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+        handle, inputType, fileName)
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Forwards the call to the nvJitLinkComplete entry point stored in __nvJitLinkComplete.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkComplete
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedCubinSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedCubinSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedCubin.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedCubin
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+        handle, cubin)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedPtxSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedPtxSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetLinkedPtx.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetLinkedPtx
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+        handle, ptx)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetErrorLogSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetErrorLogSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetErrorLog.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetErrorLog
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+        handle, log)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetInfoLogSize.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetInfoLogSize
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Forwards the call to the entry point stored in __nvJitLinkGetInfoLog.
+    # Raises FunctionNotFoundError if that pointer is still NULL after initialization.
+    global __nvJitLinkGetInfoLog
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+        handle, log)
diff --git a/cuda_bindings/cynvJitLink.pxd b/cuda_bindings/cynvJitLink.pxd
new file mode 100644
index 000000000..ed440c0b3
--- /dev/null
+++ b/cuda_bindings/cynvJitLink.pxd
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+
+from libc.stdint cimport int64_t
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums
+
+
+
+# types
+# Verbatim C block: emits the three #include lines below into the generated C
+# file and exposes cudaStream_t to Cython.
+# NOTE(review): nvJitLink.h is not included here and cudaStream_t is the only
+# type declared; confirm whether the nvJitLink types belong in this block.
+cdef extern from *:
+    """
+    #include <driver_types.h>
+    #include <library_types.h>
+    #include <cuComplex.h>
+    """
+    ctypedef void* cudaStream_t 'cudaStream_t'
+
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+# C-level entry points implemented in cynvJitLink.pyx; each may raise a Python
+# exception (``except*``) and can be called without the GIL.
+# NOTE(review): nvJitLinkResult, nvJitLinkHandle, nvJitLinkInputType and
+# uint32_t are used below but are not declared or cimported anywhere in this
+# file as shown -- confirm where they are expected to come from.
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cynvJitLink.pyx b/cuda_bindings/cynvJitLink.pyx
new file mode 100644
index 000000000..65d3f9840
--- /dev/null
+++ b/cuda_bindings/cynvJitLink.pyx
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ._internal cimport nvJitLink as _nvJitLink
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkCreate.
+    return _nvJitLink._nvJitLinkCreate(handle, numOptions, options)
+
+
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkDestroy.
+    return _nvJitLink._nvJitLinkDestroy(handle)
+
+
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkAddData.
+    return _nvJitLink._nvJitLinkAddData(handle, inputType, data, size, name)
+
+
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkAddFile.
+    return _nvJitLink._nvJitLinkAddFile(handle, inputType, fileName)
+
+
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkComplete.
+    return _nvJitLink._nvJitLinkComplete(handle)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetLinkedCubinSize.
+    return _nvJitLink._nvJitLinkGetLinkedCubinSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetLinkedCubin.
+    return _nvJitLink._nvJitLinkGetLinkedCubin(handle, cubin)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetLinkedPtxSize.
+    return _nvJitLink._nvJitLinkGetLinkedPtxSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetLinkedPtx.
+    return _nvJitLink._nvJitLinkGetLinkedPtx(handle, ptx)
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetErrorLogSize.
+    return _nvJitLink._nvJitLinkGetErrorLogSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetErrorLog.
+    return _nvJitLink._nvJitLinkGetErrorLog(handle, log)
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetInfoLogSize.
+    return _nvJitLink._nvJitLinkGetInfoLogSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Pass-through to the backend module's _nvJitLinkGetInfoLog.
+    return _nvJitLink._nvJitLinkGetInfoLog(handle, log)
diff --git a/cuda_bindings/nvJitLink.pxd b/cuda_bindings/nvJitLink.pxd
new file mode 100644
index 000000000..d063002be
--- /dev/null
+++ b/cuda_bindings/nvJitLink.pxd
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .cynvJitLink cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+
+
+# Short aliases for types expected from the cimported cynvJitLink declarations.
+# NOTE(review): cynvJitLink.pxd as added in this patch declares cudaStream_t
+# only; cudaDataType and libraryPropertyType_t are not declared there --
+# confirm these two aliases are intended and resolvable.
+ctypedef cudaStream_t Stream
+ctypedef cudaDataType DataType
+ctypedef libraryPropertyType_t LibraryPropertyType
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+# Python-visible wrappers implemented in nvJitLink.pyx. Parameters typed
+# intptr_t carry raw addresses that the implementations cast to C pointers.
+# NOTE(review): uint32_t is used by create() but this file cimports only
+# intptr_t from libc.stdint -- confirm it is provided via cynvJitLink.
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
+cpdef destroy(intptr_t handle)
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name)
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name)
+cpdef complete(nvJitLinkHandle handle)
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin)
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx)
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log)
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log)
diff --git a/cuda_bindings/nvJitLink.pyx b/cuda_bindings/nvJitLink.pyx
new file mode 100644
index 000000000..18f4c7545
--- /dev/null
+++ b/cuda_bindings/nvJitLink.pyx
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from enum import IntEnum as _IntEnum
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+# Maps each nvJitLink status code to its symbolic name; used to build the
+# message of nvJitLinkError.
+cdef dict STATUS = {
+    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
+    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
+    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH',  # -arch=sm_NN option not specified
+    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
+    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
+    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
+    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL',
+    NVJITLINK_ERROR_THREADPOOL          : 'NVJITLINK_ERROR_THREADPOOL',
+    NVJITLINK_ERROR_UNRECOGNIZED_INPUT  : 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
+    NVJITLINK_ERROR_NULL_INPUT          : 'NVJITLINK_ERROR_NULL_INPUT',
+    NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS: 'NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS',
+    NVJITLINK_ERROR_INCORRECT_INPUT_TYPE: 'NVJITLINK_ERROR_INCORRECT_INPUT_TYPE',
+    NVJITLINK_ERROR_ARCH_MISMATCH       : 'NVJITLINK_ERROR_ARCH_MISMATCH',
+    NVJITLINK_ERROR_OUTDATED_LIBRARY    : 'NVJITLINK_ERROR_OUTDATED_LIBRARY',
+    NVJITLINK_ERROR_MISSING_FATBIN      : 'NVJITLINK_ERROR_MISSING_FATBIN',
+}
+
+class nvJitLinkError(Exception):
+    """Exception carrying a non-zero nvJitLink status code.
+
+    The numeric code is kept on ``self.status``; the exception message is the
+    code's symbolic name looked up in ``STATUS``.
+    """
+
+    def __init__(self, status):
+        self.status = status
+        # Fall back to a generic message for codes missing from STATUS so the
+        # original failure is not masked by a KeyError raised in here.
+        err = STATUS.get(status, f"unknown nvJitLink status ({status})")
+        super().__init__(err)
+
+    def __reduce__(self):
+        # Rebuild from the status alone so the exception survives pickling.
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cdef inline int _check_status(int status) except -1 nogil:
+    # Raise nvJitLinkError for any non-zero status; return 0 otherwise.
+    # This is the name the wrapper functions in this module call. The explicit
+    # ``except -1`` clause makes the raised exception propagate to the caller.
+    if status != 0:
+        with gil:
+            raise nvJitLinkError(status)
+    return 0
+
+
+@cython.profile(False)
+cdef inline void check_status(int status) except * nogil:
+    # Kept under the original name for compatibility; same behavior as
+    # _check_status().
+    _check_status(status)
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):
+    """Call ``nvJitLinkCreate`` with the GIL released.
+
+    ``handle`` and ``options`` are integer addresses cast to
+    ``nvJitLinkHandle*`` and ``const char**``. The returned status is passed
+    to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkCreate(<nvJitLinkHandle*>handle, num_options, <const char**>options)
+        _check_status(status)
+
+
+cpdef destroy(intptr_t handle):
+    """Call ``nvJitLinkDestroy`` with the GIL released.
+
+    ``handle`` is an integer address cast to ``nvJitLinkHandle*``. The
+    returned status is passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkDestroy(<nvJitLinkHandle*>handle)
+        _check_status(status)
+
+
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name):
+    """Call ``nvJitLinkAddData`` with the GIL released.
+
+    ``data`` and ``name`` are integer addresses cast to ``const void*`` and
+    ``const char*``. The returned status is passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkAddData(handle, input_type, <const void*>data, size, <const char*>name)
+        _check_status(status)
+
+
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name):
+    """Call ``nvJitLinkAddFile`` with the GIL released.
+
+    ``file_name`` is an integer address cast to ``const char*``. The returned
+    status is passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkAddFile(handle, input_type, <const char*>file_name)
+        _check_status(status)
+
+
+cpdef complete(nvJitLinkHandle handle):
+    """Call ``nvJitLinkComplete`` with the GIL released.
+
+    The returned status is passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkComplete(handle)
+        _check_status(status)
+
+
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size):
+    """Call ``nvJitLinkGetLinkedCubinSize`` with the GIL released.
+
+    ``size`` is an integer address cast to ``size_t*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetLinkedCubinSize(handle, <size_t*>size)
+        _check_status(status)
+
+
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin):
+    """Call ``nvJitLinkGetLinkedCubin`` with the GIL released.
+
+    ``cubin`` is an integer address cast to ``void*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetLinkedCubin(handle, <void*>cubin)
+        _check_status(status)
+
+
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size):
+    """Call ``nvJitLinkGetLinkedPtxSize`` with the GIL released.
+
+    ``size`` is an integer address cast to ``size_t*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetLinkedPtxSize(handle, <size_t*>size)
+        _check_status(status)
+
+
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx):
+    """Call ``nvJitLinkGetLinkedPtx`` with the GIL released.
+
+    ``ptx`` is an integer address cast to ``char*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetLinkedPtx(handle, <char*>ptx)
+        _check_status(status)
+
+
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size):
+    """Call ``nvJitLinkGetErrorLogSize`` with the GIL released.
+
+    ``size`` is an integer address cast to ``size_t*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetErrorLogSize(handle, <size_t*>size)
+        _check_status(status)
+
+
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log):
+    """Call ``nvJitLinkGetErrorLog`` with the GIL released.
+
+    ``log`` is an integer address cast to ``char*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetErrorLog(handle, <char*>log)
+        _check_status(status)
+
+
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size):
+    """Call ``nvJitLinkGetInfoLogSize`` with the GIL released.
+
+    ``size`` is an integer address cast to ``size_t*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetInfoLogSize(handle, <size_t*>size)
+        _check_status(status)
+
+
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log):
+    """Call ``nvJitLinkGetInfoLog`` with the GIL released.
+
+    ``log`` is an integer address cast to ``char*``. The returned status is
+    passed to ``_check_status``.
+    """
+    with nogil:
+        status = nvJitLinkGetInfoLog(handle, <char*>log)
+        _check_status(status)
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index fb9d7b953..27b83f946 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -57,7 +57,8 @@
                  'cuda_egl_interop.h',
                  'cuda_gl_interop.h',
                  'cuda_vdpau_interop.h'],
-    'nvrtc' : ['nvrtc.h']}
+    'nvrtc' : ['nvrtc.h'],
+    'nvJitLink' : ['nvJitLink.h'],}
 
 replace = {' __device_builtin__ ':' ',
            'CUDARTAPI ':' ',
diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
new file mode 100644
index 000000000..7ced5ff38
--- /dev/null
+++ b/cuda_bindings/tests/test_nvJitLink.py
@@ -0,0 +1,3 @@
+import pytest
+from cuda import nvJitLink
+

From e893cd2fee43f4c9abb9311ec076c8356d3157bd Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 15 Oct 2024 10:11:07 -0700
Subject: [PATCH 04/34] add nvJitLink test file

---
 cuda_bindings/tests/test_nvJitLink.py | 161 ++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
index 7ced5ff38..f566ae7c6 100644
--- a/cuda_bindings/tests/test_nvJitLink.py
+++ b/cuda_bindings/tests/test_nvJitLink.py
@@ -1,3 +1,164 @@
 import pytest
 from cuda import nvJitLink
 
+def test_create_no_arch_error():
+    # nvjitlink expects at least the architecture to be specified.
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):  # match is a regex searched in str(exception)
+        nvJitLink.create()  # called with no options at all, hence no -arch flag
+
+
+def test_invalid_arch_error():
+    # sm_XX is not a valid architecture
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):  # match is a regex searched in str(exception)
+        nvJitLink.create("-arch=sm_XX")  # well-formed flag, bogus value
+
+
+def test_unrecognized_option_error():  # an option name the linker does not know must be rejected
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):  # match is a regex searched in str(exception)
+        nvJitLink.create("-fictitious_option")
+
+
+def test_invalid_option_type_error():  # a non-string option is expected to fail with TypeError, not RuntimeError
+    with pytest.raises(TypeError, match="Expecting only strings"):
+        nvJitLink.create("-arch", 53)  # 53 is an int, not a str
+
+
+def test_create_and_destroy():  # smoke test: a handle can be created and then released
+    handle = nvJitLink.create("-arch=sm_53")
+    assert handle != 0  # a zero handle is treated as "creation failed"
+    nvJitLink.destroy(handle)
+
+
+def test_complete_empty():  # completing a link with no inputs added must not raise
+    handle = nvJitLink.create("-arch=sm_75")
+    nvJitLink.complete(handle)
+    nvJitLink.destroy(handle)
+
+
+@pytest.mark.parametrize(
+    "input_file,input_type",
+    [
+        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
+        ("device_functions_fatbin", nvJitLink.InputType.FATBIN),  # InputType is only reachable through the nvJitLink module
+        ("device_functions_ptx", nvJitLink.InputType.PTX),
+        ("device_functions_object", nvJitLink.InputType.OBJECT),
+        ("device_functions_archive", nvJitLink.InputType.LIBRARY),
+    ],
+)
+def test_add_file(input_file, input_type, gpu_arch_flag, request):  # each input kind must be accepted by add_data
+    filename, data = request.getfixturevalue(input_file)  # input_file is the name of a fixture yielding (filename, data)
+
+    handle = nvJitLink.create(gpu_arch_flag)
+    nvJitLink.add_data(handle, input_type.value, data, filename)  # .value: pass the raw enum value to the binding
+    nvJitLink.destroy(handle)
+
+
+# We test the LTO input case separately as it requires the `-lto` flag. The
+# OBJECT input type is used because the LTO-IR container is packaged in an ELF
+# object when produced by NVCC.
+def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object  # fixture yields (filename, data)
+
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, nvJitLink.InputType.OBJECT.value, data, filename)  # InputType is only reachable through the nvJitLink module
+    nvJitLink.destroy(handle)
+
+
+def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):  # a failed link must leave a readable error log
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = undefined_extern_cubin  # fixture yields (filename, data)
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType is only reachable through the nvJitLink module
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError):  # the link itself is expected to fail
+        nvJitLink.complete(handle)
+    error_log = nvJitLink.get_error_log(handle)
+    nvJitLink.destroy(handle)  # release the handle before asserting so a failed assert does not skip it
+    assert (
+        "Undefined reference to '_Z5undefff' "
+        "in 'undefined_extern.cubin'" in error_log
+    )
+
+
+def test_get_info_log(device_functions_cubin, gpu_arch_flag):  # a clean link is expected to produce an empty info log
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin  # fixture yields (filename, data)
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType is only reachable through the nvJitLink module
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    info_log = nvJitLink.get_info_log(handle)
+    nvJitLink.destroy(handle)  # release the handle before asserting so a failed assert does not skip it
+    # Info log is empty
+    assert "" == info_log
+
+
+def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):  # linking a cubin must yield an ELF image
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin  # fixture yields (filename, data)
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType is only reachable through the nvJitLink module
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)  # release the handle before asserting so a failed assert does not skip it
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_cubin_link_not_complete_error(
+    device_functions_cubin, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin  # fixture yields (filename, data)
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType is only reachable through the nvJitLink module
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):  # complete() was deliberately not called
+        nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType is only reachable through the nvJitLink module
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)  # release the handle before asserting so a failed assert does not skip it
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType is only reachable through the nvJitLink module
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    nvJitLink.get_linked_ptx(handle)  # result is not inspected; the test only requires that this call not raise
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_ptx_link_not_complete_error(
+    device_functions_ltoir_object, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    filename, data = device_functions_ltoir_object  # fixture yields (filename, data)
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType is only reachable through the nvJitLink module
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):  # complete() was deliberately not called
+        nvJitLink.get_linked_ptx(handle)
+    nvJitLink.destroy(handle)
+
+
+def test_package_version():
+    assert pynvjitlink.__version__ is not None  # NOTE(review): `pynvjitlink` is never imported in this module (only pytest and nvJitLink are) -- NameError as written; confirm which package's version should be checked
+    assert len(str(pynvjitlink.__version__)) > 0  # the version must also stringify to something non-empty
\ No newline at end of file

From 5d60eb1e36831156bd5d0b2d636571b2f82e638b Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Wed, 16 Oct 2024 15:45:29 -0700
Subject: [PATCH 05/34] rename nvJitLink bindings to nvjitlink and move them under _internal

---
 .../cuda/bindings/_internal/__init__.py       |   0
 .../nvJitLink.pxd => _internal/nvjitlink.pxd} |   4 +-
 .../nvjitlink.pyx}                            |  48 +--
 .../bindings/_internal/nvjitlink_linux.pyx    | 382 ++++++++++++++++++
 .../nvjitlink_windows.pyx}                    |  50 +--
 .../cuda/bindings/_internal/utils.pxd         | 172 ++++++++
 .../cuda/bindings/_internal/utils.pyx         | 139 +++++++
 .../bindings/cynvjitlink.pxd}                 |  29 +-
 .../bindings/cynvjitlink.pyx}                 |  30 +-
 cuda_bindings/cuda/bindings/nvjitlink.pxd     |  43 ++
 cuda_bindings/cuda/bindings/nvjitlink.pyx     | 153 +++++++
 cuda_bindings/nvJitLink.pxd                   |  46 ---
 cuda_bindings/nvJitLink.pyx                   | 138 -------
 cuda_bindings/setup.py                        |  64 ++-
 .../{test_nvJitLink.py => test_nvjitlink.py}  | 102 ++---
 15 files changed, 1086 insertions(+), 314 deletions(-)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/__init__.py
 rename cuda_bindings/cuda/bindings/{_bindings/nvJitLink.pxd => _internal/nvjitlink.pxd} (95%)
 rename cuda_bindings/cuda/bindings/{_bindings/nvJitLink_linux.pyx => _internal/nvjitlink.pyx} (93%)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
 rename cuda_bindings/cuda/bindings/{_bindings/nvJitLink_windows.pyx => _internal/nvjitlink_windows.pyx} (93%)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/utils.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_internal/utils.pyx
 rename cuda_bindings/{cynvJitLink.pxd => cuda/bindings/cynvjitlink.pxd} (60%)
 rename cuda_bindings/{cynvJitLink.pyx => cuda/bindings/cynvjitlink.pyx} (66%)
 create mode 100644 cuda_bindings/cuda/bindings/nvjitlink.pxd
 create mode 100644 cuda_bindings/cuda/bindings/nvjitlink.pyx
 delete mode 100644 cuda_bindings/nvJitLink.pxd
 delete mode 100644 cuda_bindings/nvJitLink.pyx
 rename cuda_bindings/tests/{test_nvJitLink.py => test_nvjitlink.py} (62%)

diff --git a/cuda_bindings/cuda/bindings/_internal/__init__.py b/cuda_bindings/cuda/bindings/_internal/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
similarity index 95%
rename from cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
rename to cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
index dca128a0e..ac3a9023b 100644
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
-from ..cynvJitLink cimport *
+from ..cynvjitlink cimport *
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
similarity index 93%
rename from cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
rename to cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
index 2fc6ca625..ff7a6ca3a 100644
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
@@ -2,11 +2,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
-from .utils cimport get_nvJitLink_dso_version_suffix
+from .utils cimport get_nvjitlink_dso_version_suffix
 
 from .utils import FunctionNotFoundError, NotSupportedError
 
@@ -34,7 +34,7 @@ cdef extern from "<dlfcn.h>" nogil:
 # Wrapper init
 ###############################################################################
 
-cdef bint __py_nvJitLink_init = False
+cdef bint __py_nvjitlink_init = False
 cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __nvJitLinkCreate = NULL
@@ -54,20 +54,20 @@ cdef void* __nvJitLinkGetInfoLog = NULL
 
 cdef void* load_library(const int driver_ver) except* with gil:
     cdef void* handle
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
-        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)
+    for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
+        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)  # on-disk name keeps its mixed case; dlopen is case-sensitive
         handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
         if handle != NULL:
             break
     else:
         err_msg = dlerror()
-        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
+        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')  # name the library exactly as it is spelled on disk
     return handle
 
 
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
+cdef int _check_or_init_nvjitlink() except -1 nogil:
+    global __py_nvjitlink_init
+    if __py_nvjitlink_init:
         return 0
 
     # Load driver to check version
@@ -183,7 +183,7 @@ cdef int _check_or_init_nvJitLink() except -1 nogil:
             handle = load_library(driver_ver)
         __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
 
-    __py_nvJitLink_init = True
+    __py_nvjitlink_init = True
     return 0
 
 
@@ -195,7 +195,7 @@ cpdef dict _inspect_function_pointers():
     if func_ptrs is not None:
         return func_ptrs
 
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     cdef dict data = {}
 
     global __nvJitLinkCreate
@@ -254,7 +254,7 @@ cpdef _inspect_function_pointer(str name):
 
 cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
     global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkCreate == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkCreate is not found")
@@ -264,7 +264,7 @@ cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptio
 
 cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
     global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkDestroy == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
@@ -274,7 +274,7 @@ cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
 
 cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
     global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkAddData == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkAddData is not found")
@@ -284,7 +284,7 @@ cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputTyp
 
 cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
     global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkAddFile == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
@@ -294,7 +294,7 @@ cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputTyp
 
 cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
     global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkComplete == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkComplete is not found")
@@ -304,7 +304,7 @@ cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedCubinSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
@@ -314,7 +314,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
     global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedCubin == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
@@ -324,7 +324,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubi
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedPtxSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
@@ -334,7 +334,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t*
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
     global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedPtx == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
@@ -344,7 +344,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) e
 
 cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetErrorLogSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
@@ -354,7 +354,7 @@ cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* s
 
 cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
     global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetErrorLog == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
@@ -364,7 +364,7 @@ cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) ex
 
 cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetInfoLogSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
@@ -374,7 +374,7 @@ cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* si
 
 cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
     global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetInfoLog == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
new file mode 100644
index 000000000..ff7a6ca3a
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -0,0 +1,382 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvjitlink_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:  # every declaration in this block is callable without the GIL
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)  # declared, but the only call in this file is commented out
+
+    enum:  # anonymous enum: exposes the RTLD_* macros as integer constants
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'  # pseudo-handle passed to dlsym in place of a dlopen handle
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_nvjitlink_init = False  # set to True once _check_or_init_nvjitlink() has run to completion
+cdef void* __cuDriverGetVersion = NULL  # resolved from libcuda.so.1; used only to query the driver version
+
+cdef void* __nvJitLinkCreate = NULL  # one untyped slot per nvJitLink entry point; NULL means "not resolved"
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:  # returns the dlopen handle of the first candidate that loads
+    cdef void* handle
+    for suffix in get_nvjitlink_dso_version_suffix(driver_ver):  # candidate version suffixes; an empty suffix means the unversioned name
+        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)  # on-disk name keeps its mixed case; dlopen is case-sensitive
+        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
+        if handle != NULL:
+            break
+    else:  # for/else: reached only when no candidate could be opened
+        err_msg = dlerror()
+        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')  # name the library exactly as it is spelled on disk
+    return handle
+
+
+cdef int _check_or_init_nvjitlink() except -1 nogil:  # one-time symbol resolution; returns 0, with -1 declared as the error sentinel
+    global __py_nvjitlink_init
+    if __py_nvjitlink_init:  # fast path: everything was already resolved by an earlier call
+        return 0
+
+    # Load driver to check version
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:  # raising a Python exception needs the GIL
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    global __cuDriverGetVersion
+    if __cuDriverGetVersion == NULL:  # keep a pointer left over from an earlier, failed initialisation attempt
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:  # libcuda loaded but does not export cuDriverGetVersion
+        with gil:
+            raise RuntimeError('something went wrong')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)  # call through the pointer cast to int (*)(int*)
+    if err != 0:  # any non-zero return is treated as failure
+        with gil:
+            raise RuntimeError('something went wrong')
+    #dlclose(handle)
+    handle = NULL  # drop the libcuda handle; from here on `handle` refers to the nvJitLink library, loaded on demand
+
+    # Load function
+    global __nvJitLinkCreate
+    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')  # first try symbols already visible in the process
+    if __nvJitLinkCreate == NULL:
+        if handle == NULL:  # open the library at most once; later lookups reuse the handle
+            handle = load_library(driver_ver)
+        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')  # may still be NULL; the wrappers check before calling
+    
+    global __nvJitLinkDestroy
+    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')  # same two-step lookup for every remaining symbol
+    if __nvJitLinkDestroy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
+    
+    global __nvJitLinkAddData
+    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
+    if __nvJitLinkAddData == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
+    
+    global __nvJitLinkAddFile
+    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
+    if __nvJitLinkAddFile == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
+    
+    global __nvJitLinkComplete
+    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
+    if __nvJitLinkComplete == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
+    
+    global __nvJitLinkGetLinkedCubinSize
+    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
+    
+    global __nvJitLinkGetLinkedCubin
+    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
+    if __nvJitLinkGetLinkedCubin == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
+    
+    global __nvJitLinkGetLinkedPtxSize
+    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
+    
+    global __nvJitLinkGetLinkedPtx
+    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
+    if __nvJitLinkGetLinkedPtx == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
+    
+    global __nvJitLinkGetErrorLogSize
+    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
+    if __nvJitLinkGetErrorLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
+    
+    global __nvJitLinkGetErrorLog
+    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
+    if __nvJitLinkGetErrorLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
+    
+    global __nvJitLinkGetInfoLogSize
+    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
+    if __nvJitLinkGetInfoLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
+    
+    global __nvJitLinkGetInfoLog
+    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
+    if __nvJitLinkGetInfoLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
+
+    __py_nvjitlink_init = True  # set only after every lookup ran, so a failure above is retried on the next call
+    return 0
+
+
+cdef dict func_ptrs = None  # cache of the name -> address mapping built by _inspect_function_pointers()
+
+
+cpdef dict _inspect_function_pointers():  # returns every resolved symbol address as a Python int, keyed by slot name
+    global func_ptrs
+    if func_ptrs is not None:  # built once; the same dict object is returned on every later call
+        return func_ptrs
+
+    _check_or_init_nvjitlink()  # make sure the slots below have been filled in
+    cdef dict data = {}
+
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate  # keys keep the double-underscore prefix; an unresolved slot is stored as 0
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # publish the cache only once it is fully populated
+    return data
+
+
+cpdef _inspect_function_pointer(str name):  # look up a single address; name must be a key such as "__nvJitLinkCreate"
+    global func_ptrs
+    if func_ptrs is None:  # build the cache on first use
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]  # an unknown name raises KeyError
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    global __nvJitLinkCreate
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkCreate == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(  # cast the untyped slot to the C signature, then call
+        handle, numOptions, options)
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    global __nvJitLinkDestroy
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkDestroy == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(  # cast the untyped slot to the C signature, then call
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    global __nvJitLinkAddData
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkAddData == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(  # cast the untyped slot to the C signature, then call
+        handle, inputType, data, size, name)
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    global __nvJitLinkAddFile
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkAddFile == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(  # cast the untyped slot to the C signature, then call
+        handle, inputType, fileName)
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    global __nvJitLinkComplete
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkComplete == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(  # cast the untyped slot to the C signature, then call
+        handle)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    global __nvJitLinkGetLinkedCubinSize
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetLinkedCubinSize == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(  # cast the untyped slot to the C signature, then call
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    global __nvJitLinkGetLinkedCubin
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetLinkedCubin == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(  # cast the untyped slot to the C signature, then call
+        handle, cubin)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    global __nvJitLinkGetLinkedPtxSize
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetLinkedPtxSize == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(  # cast the untyped slot to the C signature, then call
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    global __nvJitLinkGetLinkedPtx
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetLinkedPtx == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(  # cast the untyped slot to the C signature, then call
+        handle, ptx)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    global __nvJitLinkGetErrorLogSize
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetErrorLogSize == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(  # cast the untyped slot to the C signature, then call
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    global __nvJitLinkGetErrorLog
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetErrorLog == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(  # cast the untyped slot to the C signature, then call
+        handle, log)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    global __nvJitLinkGetInfoLogSize
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetInfoLogSize == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(  # cast the untyped slot to the C signature, then call
+        handle, size)
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    global __nvJitLinkGetInfoLog
+    _check_or_init_nvjitlink()  # resolves all symbols on first use; returns immediately afterwards
+    if __nvJitLinkGetInfoLog == NULL:  # found neither process-wide nor in the loaded library
+        with gil:  # raising a Python exception needs the GIL
+            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
+    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(  # cast the untyped slot to the C signature, then call
+        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
similarity index 93%
rename from cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
rename to cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 8856b59ca..43852441e 100644
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -2,11 +2,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
-from .utils cimport get_nvJitLink_dso_version_suffix
+from .utils cimport get_nvjitlink_dso_version_suffix
 
 import os
 import site
@@ -23,7 +23,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800
 LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
 LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
-cdef bint __py_nvJitLink_init = False
+cdef bint __py_nvjitlink_init = False
 cdef void* __cuDriverGetVersion = NULL
 
 cdef void* __nvJitLinkCreate = NULL
@@ -48,10 +48,10 @@ cdef inline list get_site_packages():
 cdef load_library(const int driver_ver):
     handle = 0
 
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
+    for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
         if len(suffix) == 0:
             continue
-        dll_name = f"nvJitLink64_{suffix}.dll"
+        dll_name = f"nvjitlink64_{suffix}.dll"
 
         # First check if the DLL has been loaded by 3rd parties
         try:
@@ -63,7 +63,7 @@ cdef load_library(const int driver_ver):
 
         # Next, check if DLLs are installed via pip
         for sp in get_site_packages():
-            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
+            mod_path = os.path.join(sp, "nvidia", "nvjitlink", "bin")
             if not os.path.isdir(mod_path):
                 continue
             os.add_dll_directory(mod_path)
@@ -85,15 +85,15 @@ cdef load_library(const int driver_ver):
         else:
             break
     else:
-        raise RuntimeError('Failed to load nvJitLink')
+        raise RuntimeError('Failed to load nvjitlink')
 
     assert handle != 0
     return handle
 
 
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
+cdef int _check_or_init_nvjitlink() except -1 nogil:
+    global __py_nvjitlink_init
+    if __py_nvjitlink_init:
         return 0
 
     cdef int err, driver_ver
@@ -194,7 +194,7 @@ cdef int _check_or_init_nvJitLink() except -1 nogil:
         except:
             pass
 
-    __py_nvJitLink_init = True
+    __py_nvjitlink_init = True
     return 0
 
 
@@ -206,7 +206,7 @@ cpdef dict _inspect_function_pointers():
     if func_ptrs is not None:
         return func_ptrs
 
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     cdef dict data = {}
 
     global __nvJitLinkCreate
@@ -265,7 +265,7 @@ cpdef _inspect_function_pointer(str name):
 
 cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
     global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkCreate == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkCreate is not found")
@@ -275,7 +275,7 @@ cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptio
 
 cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
     global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkDestroy == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
@@ -285,7 +285,7 @@ cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
 
 cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
     global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkAddData == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkAddData is not found")
@@ -295,7 +295,7 @@ cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputTyp
 
 cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
     global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkAddFile == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
@@ -305,7 +305,7 @@ cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputTyp
 
 cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
     global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkComplete == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkComplete is not found")
@@ -315,7 +315,7 @@ cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedCubinSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
@@ -325,7 +325,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
     global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedCubin == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
@@ -335,7 +335,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubi
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedPtxSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
@@ -345,7 +345,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t*
 
 cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
     global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetLinkedPtx == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
@@ -355,7 +355,7 @@ cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) e
 
 cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetErrorLogSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
@@ -365,7 +365,7 @@ cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* s
 
 cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
     global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetErrorLog == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
@@ -375,7 +375,7 @@ cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) ex
 
 cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
     global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetInfoLogSize == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
@@ -385,7 +385,7 @@ cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* si
 
 cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
     global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
+    _check_or_init_nvjitlink()
     if __nvJitLinkGetInfoLog == NULL:
         with gil:
             raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
new file mode 100644
index 000000000..225ab3648
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -0,0 +1,172 @@
+from libc.stdint cimport int32_t, int64_t, intptr_t
+from libcpp.vector cimport vector
+from libcpp cimport bool as cppbool
+from libcpp cimport nullptr_t, nullptr
+from libcpp.memory cimport unique_ptr
+
+
+cdef extern from * nogil:
+    """
+    template<typename T>
+    class nullable_unique_ptr {
+      public:
+        nullable_unique_ptr() noexcept = default;
+
+        nullable_unique_ptr(std::nullptr_t) noexcept = delete;
+
+        explicit nullable_unique_ptr(T* data, bool own_data):
+            own_data_(own_data)
+        {
+            if (own_data)
+                manager_.reset(data);
+            else
+                raw_data_ = data;       
+        }
+
+        nullable_unique_ptr(const nullable_unique_ptr&) = delete;
+
+        nullable_unique_ptr& operator=(const nullable_unique_ptr&) = delete;
+
+        nullable_unique_ptr(nullable_unique_ptr&& other) noexcept
+        {
+            own_data_ = other.own_data_;
+            other.own_data_ = false;  // ownership is transferred
+            if (own_data_)
+            {
+                manager_ = std::move(other.manager_);
+                raw_data_ = nullptr;  // just in case
+            }   
+            else
+            {
+                manager_.reset(nullptr);  // just in case
+                raw_data_ = other.raw_data_;
+            }
+        }
+
+        nullable_unique_ptr& operator=(nullable_unique_ptr&& other) noexcept
+        {
+            own_data_ = other.own_data_;
+            other.own_data_ = false;  // ownership is transferred
+            if (own_data_)
+            {
+                manager_ = std::move(other.manager_);
+                raw_data_ = nullptr;  // just in case
+            }   
+            else
+            {
+                manager_.reset(nullptr);  // just in case
+                raw_data_ = other.raw_data_;
+            }
+            return *this;
+        }
+
+        ~nullable_unique_ptr() = default;
+
+        void reset(T* data, bool own_data)
+        {
+            own_data_ = own_data;
+            if (own_data_)
+            {
+                manager_.reset(data);
+                raw_data_ = nullptr;
+            }
+            else
+            {
+                manager_.reset(nullptr);
+                raw_data_ = data;
+            }
+        }
+
+        void swap(nullable_unique_ptr& other) noexcept
+        {
+            std::swap(manager_, other.manager_);
+            std::swap(raw_data_, other.raw_data_);
+            std::swap(own_data_, other.own_data_);
+        }
+
+        /*
+         * Get the pointer to the underlying object (this is different from data()!).
+         */
+        T* get() const noexcept
+        {
+            if (own_data_)
+                return manager_.get();
+            else
+                return raw_data_;
+        }
+
+        /*
+         * Get the pointer to the underlying buffer (this is different from get()!).
+         */
+        void* data() noexcept
+        {
+            if (own_data_)
+                return manager_.get()->data();
+            else
+                return raw_data_;
+        }
+
+        T& operator*()
+        {
+            if (own_data_)
+                return *manager_;
+            else
+                return *raw_data_;
+        }
+
+      private:
+        std::unique_ptr<T> manager_{};
+        T* raw_data_{nullptr};
+        bool own_data_{false};
+    };
+    """
+    # xref: cython/Cython/Includes/libcpp/memory.pxd
+    cdef cppclass nullable_unique_ptr[T]:
+        nullable_unique_ptr()
+        nullable_unique_ptr(T*, cppbool)
+        nullable_unique_ptr(nullable_unique_ptr[T]&)
+
+        # Modifiers
+        void reset(T*, cppbool)
+        void swap(nullable_unique_ptr&)
+
+        # Observers
+        T* get()
+        T& operator*()
+        void* data()
+
+
+cdef extern from "<cuComplex.h>" nogil:
+    ctypedef struct cuComplex:
+        pass
+    ctypedef struct cuDoubleComplex:
+        pass
+
+
+ctypedef fused ResT:
+    int
+    int32_t
+    int64_t
+
+
+ctypedef fused PtrT:
+    float
+    double
+    cuComplex
+    cuDoubleComplex
+    void
+
+
+cdef cppclass nested_resource[T]:
+    nullable_unique_ptr[ vector[intptr_t] ] ptrs
+    nullable_unique_ptr[ vector[vector[T]] ] nested_resource_ptr
+
+cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr_(object obj, ResT* __unused)
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 0
+cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused)
+cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused)
+
+cdef bint is_nested_sequence(data)
+cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=*) except*
+
+cdef tuple get_nvjitlink_dso_version_suffix(int driver_ver)
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
new file mode 100644
index 000000000..b575ddc03
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -0,0 +1,139 @@
+cimport cpython
+from libc.stdint cimport intptr_t
+from libcpp.utility cimport move
+from cython.operator cimport dereference as deref
+
+
+cdef bint is_nested_sequence(data):
+    if not cpython.PySequence_Check(data):
+        return False
+    else:
+        for i in data:
+            if not cpython.PySequence_Check(i):
+                return False
+        else:
+            return True
+
+
+cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=True) except*:
+    """The caller must ensure ``buf`` is alive when the returned pointer is in use."""
+    cdef void* bufPtr
+    cdef int flags = cpython.PyBUF_ANY_CONTIGUOUS
+    if not readonly:
+        flags |= cpython.PyBUF_WRITABLE  # a writable view is only requested when asked for
+    cdef int status = -1  # stays -1 unless PyObject_GetBuffer succeeded
+    cdef cpython.Py_buffer view
+
+    if isinstance(buf, int):
+        bufPtr = <void*><intptr_t>buf  # a Python int is taken as a raw address, unchecked
+    else:  # try buffer protocol
+        try:
+            status = cpython.PyObject_GetBuffer(buf, &view, flags)
+            if view.len != size or view.ndim != 1:  # explicit check: asserts can be disabled
+                raise ValueError(f"got a buffer with len={view.len}, ndim={view.ndim}")
+        except Exception as e:
+            adj = "writable " if not readonly else ""
+            raise ValueError(
+                 "buf must be either a Python int representing the pointer "
+                f"address to a valid buffer, or a 1D contiguous {adj}"
+                 "buffer, of size bytes") from e
+        else:
+            bufPtr = view.buf
+        finally:
+            if status == 0:  # release only a view that was actually acquired
+                cpython.PyBuffer_Release(&view)
+
+    return bufPtr
+
+
+# Cython can't infer the overload by return type alone, so the dummy
+# ``__unused`` argument is only there to select the ResT specialization
+cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr_(object obj, ResT* __unused):
+    cdef nullable_unique_ptr[ vector[ResT] ] ptr
+    cdef vector[ResT]* vec
+    if cpython.PySequence_Check(obj):
+        vec = new vector[ResT](len(obj))
+        ptr.reset(vec, True)  # own vec before filling it, so a failed conversion cannot leak it
+        for i in range(len(obj)):
+            deref(vec)[i] = obj[i]
+    else:
+        ptr.reset(<vector[ResT]*><intptr_t>obj, False)  # obj is used as the address of a vector not owned here
+    return move(ptr)
+
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 0:
+    cdef vector[ResT]* vec
+    if cpython.PySequence_Check(obj):
+        vec = new vector[ResT](len(obj))
+        # set the ownership immediately to avoid
+        # leaking the `vec` memory in case of exception 
+        # (e.g. ResT type range overflow)
+        # when populating the memory in the loop
+        in_out_ptr.reset(vec, True)
+        for i in range(len(obj)):
+            deref(vec)[i] = obj[i]
+    else:
+        in_out_ptr.reset(<vector[ResT]*><intptr_t>obj, False)
+    return 1
+
+
+cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused):
+    cdef nullable_unique_ptr[ vector[PtrT*] ] ptr
+    cdef vector[PtrT*]* vec
+    if cpython.PySequence_Check(obj):
+        vec = new vector[PtrT*](len(obj))
+        ptr.reset(vec, True)  # own vec before filling it, so a failed conversion cannot leak it
+        for i in range(len(obj)):
+            deref(vec)[i] = <PtrT*><intptr_t>(obj[i])  # each element is an integer address
+    else:
+        ptr.reset(<vector[PtrT*]*><intptr_t>obj, False)  # obj is used as the address of a vector not owned here
+    return move(ptr)
+
+
+cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused):
+    cdef nested_resource[ResT] res
+    cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr
+    cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr
+    cdef vector[intptr_t]* nested_vec = NULL
+    cdef vector[vector[ResT]]* nested_res_vec = NULL
+    cdef size_t i = 0, length = 0
+    cdef intptr_t addr
+    # Each new'ed vector is handed to its smart pointer right away so an exception while filling cannot leak it.
+    if is_nested_sequence(obj):
+        length = len(obj)
+        nested_res_vec = new vector[vector[ResT]](length)
+        nested_res_ptr.reset(nested_res_vec, True)
+        nested_vec = new vector[intptr_t](length)
+        nested_ptr.reset(nested_vec, True)
+        for i, obj_i in enumerate(obj):
+            deref(nested_res_vec)[i] = obj_i  # converts the inner Python sequence to vector[ResT]
+            deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data())  # address of that inner buffer
+    elif cpython.PySequence_Check(obj):
+        length = len(obj)
+        nested_vec = new vector[intptr_t](length)
+        nested_ptr.reset(nested_vec, True)
+        nested_res_ptr.reset(NULL, False)  # elements are already addresses; nothing nested to own
+        for i, addr in enumerate(obj):
+            deref(nested_vec)[i] = addr
+    else:
+        # obj is an int (ResT**)
+        nested_res_ptr.reset(NULL, False)
+        nested_ptr.reset(<vector[intptr_t]*><intptr_t>obj, False)
+
+    res.ptrs = move(nested_ptr)
+    res.nested_resource_ptr = move(nested_res_ptr)
+    return move(res)
+
+
+class FunctionNotFoundError(RuntimeError): pass
+
+class NotSupportedError(RuntimeError): pass
+
+
+cdef tuple get_nvjitlink_dso_version_suffix(int driver_ver):
+    # version suffixes to try, in order, when locating the nvJitLink DSO for this driver version
+    if 11000 <= driver_ver < 12000:
+        return ('11', '')
+    elif 12000 <= driver_ver < 13000:
+        return ('12', '11', '')
+    else:
+        raise NotSupportedError('only CUDA 11/12 driver is supported')
\ No newline at end of file
diff --git a/cuda_bindings/cynvJitLink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
similarity index 60%
rename from cuda_bindings/cynvJitLink.pxd
rename to cuda_bindings/cuda/bindings/cynvjitlink.pxd
index ed440c0b3..2913111f0 100644
--- a/cuda_bindings/cynvJitLink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -2,10 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport intptr_t, uint32_t
 
 
 ###############################################################################
@@ -13,7 +13,28 @@ from libc.stdint cimport int64_t
 ###############################################################################
 
 # enums
+ctypedef enum nvJitLinkResult "nvJitLinkResult":
+    NVJITLINK_SUCCESS "NVJITLINK_SUCCESS" = 0
+    NVJITLINK_ERROR_UNRECOGNIZED_OPTION "NVJITLINK_ERROR_UNRECOGNIZED_OPTION"
+    NVJITLINK_ERROR_MISSING_ARCH "NVJITLINK_ERROR_MISSING_ARCH"
+    NVJITLINK_ERROR_INVALID_INPUT "NVJITLINK_ERROR_INVALID_INPUT"
+    NVJITLINK_ERROR_PTX_COMPILE "NVJITLINK_ERROR_PTX_COMPILE"
+    NVJITLINK_ERROR_NVVM_COMPILE "NVJITLINK_ERROR_NVVM_COMPILE"
+    NVJITLINK_ERROR_INTERNAL "NVJITLINK_ERROR_INTERNAL"
+    NVJITLINK_ERROR_THREADPOOL "NVJITLINK_ERROR_THREADPOOL"
+    NVJITLINK_ERROR_UNRECOGNIZED_INPUT "NVJITLINK_ERROR_UNRECOGNIZED_INPUT"
+    NVJITLINK_ERROR_FINALIZE "NVJITLINK_ERROR_FINALIZE"
 
+ctypedef enum nvJitLinkInputType "nvJitLinkInputType":
+    NVJITLINK_INPUT_NONE "NVJITLINK_INPUT_NONE" = 0
+    NVJITLINK_INPUT_CUBIN "NVJITLINK_INPUT_CUBIN" = 1
+    NVJITLINK_INPUT_PTX "NVJITLINK_INPUT_PTX"
+    NVJITLINK_INPUT_LTOIR "NVJITLINK_INPUT_LTOIR"
+    NVJITLINK_INPUT_FATBIN "NVJITLINK_INPUT_FATBIN"
+    NVJITLINK_INPUT_OBJECT "NVJITLINK_INPUT_OBJECT"
+    NVJITLINK_INPUT_LIBRARY "NVJITLINK_INPUT_LIBRARY"
+    NVJITLINK_INPUT_INDEX "NVJITLINK_INPUT_INDEX"
+    NVJITLINK_INPUT_ANY "NVJITLINK_INPUT_ANY" = 10
 
 
 # types
@@ -26,7 +47,7 @@ cdef extern from *:
     ctypedef void* cudaStream_t 'cudaStream_t'
 
 
-
+ctypedef void* nvJitLinkHandle 'nvJitLinkHandle'
 
 
 ###############################################################################
@@ -45,4 +66,4 @@ cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) ex
 cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
 cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
 cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
\ No newline at end of file
diff --git a/cuda_bindings/cynvJitLink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
similarity index 66%
rename from cuda_bindings/cynvJitLink.pyx
rename to cuda_bindings/cuda/bindings/cynvjitlink.pyx
index 65d3f9840..a6703cc0f 100644
--- a/cuda_bindings/cynvJitLink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
-from ._internal cimport nvJitLink as _nvJitLink
+from ._internal cimport nvjitlink as _nvjitlink
 
 
 ###############################################################################
@@ -12,52 +12,52 @@ from ._internal cimport nvJitLink as _nvJitLink
 ###############################################################################
 
 cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    return _nvJitLink._nvJitLinkCreate(handle, numOptions, options)
+    return _nvjitlink._nvJitLinkCreate(handle, numOptions, options)
 
 
 cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    return _nvJitLink._nvJitLinkDestroy(handle)
+    return _nvjitlink._nvJitLinkDestroy(handle)
 
 
 cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    return _nvJitLink._nvJitLinkAddData(handle, inputType, data, size, name)
+    return _nvjitlink._nvJitLinkAddData(handle, inputType, data, size, name)
 
 
 cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    return _nvJitLink._nvJitLinkAddFile(handle, inputType, fileName)
+    return _nvjitlink._nvJitLinkAddFile(handle, inputType, fileName)
 
 
 cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    return _nvJitLink._nvJitLinkComplete(handle)
+    return _nvjitlink._nvJitLinkComplete(handle)
 
 
 cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedCubinSize(handle, size)
+    return _nvjitlink._nvJitLinkGetLinkedCubinSize(handle, size)
 
 
 cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedCubin(handle, cubin)
+    return _nvjitlink._nvJitLinkGetLinkedCubin(handle, cubin)
 
 
 cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedPtxSize(handle, size)
+    return _nvjitlink._nvJitLinkGetLinkedPtxSize(handle, size)
 
 
 cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedPtx(handle, ptx)
+    return _nvjitlink._nvJitLinkGetLinkedPtx(handle, ptx)
 
 
 cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetErrorLogSize(handle, size)
+    return _nvjitlink._nvJitLinkGetErrorLogSize(handle, size)
 
 
 cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    return _nvJitLink._nvJitLinkGetErrorLog(handle, log)
+    return _nvjitlink._nvJitLinkGetErrorLog(handle, log)
 
 
 cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetInfoLogSize(handle, size)
+    return _nvjitlink._nvJitLinkGetInfoLogSize(handle, size)
 
 
 cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    return _nvJitLink._nvJitLinkGetInfoLog(handle, log)
+    return _nvjitlink._nvJitLinkGetInfoLog(handle, log)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
new file mode 100644
index 000000000..de4d46170
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uint32_t
+
+from .cynvjitlink cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+ctypedef nvJitLinkHandle Handle
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+ctypedef nvJitLinkResult _NvJitLinkResult
+ctypedef nvJitLinkInputType _NvJitLinkInputType
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
+cpdef destroy(intptr_t handle)
+cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name)
+cpdef add_file(intptr_t handle, int input_type, intptr_t file_name)
+cpdef complete(intptr_t handle)
+cpdef get_linked_cubin_size(intptr_t handle, intptr_t size)
+cpdef get_linked_cubin(intptr_t handle, intptr_t cubin)
+cpdef get_linked_ptx_size(intptr_t handle, intptr_t size)
+cpdef get_linked_ptx(intptr_t handle, intptr_t ptx)
+cpdef get_error_log_size(intptr_t handle, intptr_t size)
+cpdef get_error_log(intptr_t handle, intptr_t log)
+cpdef get_info_log_size(intptr_t handle, intptr_t size)
+cpdef get_info_log(intptr_t handle, intptr_t log)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
new file mode 100644
index 000000000..8c1a89976
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -0,0 +1,153 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from enum import IntEnum as _IntEnum
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class NvJitLinkResult(_IntEnum):
+    """See `nvJitLinkResult`."""
+    SUCCESS = NVJITLINK_SUCCESS
+    ERROR_UNRECOGNIZED_OPTION = NVJITLINK_ERROR_UNRECOGNIZED_OPTION
+    ERROR_MISSING_ARCH = NVJITLINK_ERROR_MISSING_ARCH
+    ERROR_INVALID_INPUT = NVJITLINK_ERROR_INVALID_INPUT
+    ERROR_PTX_COMPILE = NVJITLINK_ERROR_PTX_COMPILE
+    ERROR_NVVM_COMPILE = NVJITLINK_ERROR_NVVM_COMPILE
+    ERROR_INTERNAL = NVJITLINK_ERROR_INTERNAL
+    ERROR_THREADPOOL = NVJITLINK_ERROR_THREADPOOL
+    ERROR_UNRECOGNIZED_INPUT = NVJITLINK_ERROR_UNRECOGNIZED_INPUT
+    ERROR_FINALIZE = NVJITLINK_ERROR_FINALIZE
+
+class NvJitLinkInputType(_IntEnum):
+    """See `nvJitLinkInputType`."""
+    INPUT_NONE = NVJITLINK_INPUT_NONE
+    INPUT_CUBIN = NVJITLINK_INPUT_CUBIN
+    INPUT_PTX = NVJITLINK_INPUT_PTX
+    INPUT_LTOIR = NVJITLINK_INPUT_LTOIR
+    INPUT_FATBIN = NVJITLINK_INPUT_FATBIN
+    INPUT_OBJECT = NVJITLINK_INPUT_OBJECT
+    INPUT_LIBRARY = NVJITLINK_INPUT_LIBRARY
+    INPUT_INDEX = NVJITLINK_INPUT_INDEX
+    INPUT_ANY = NVJITLINK_INPUT_ANY
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+# Maps each nvJitLinkResult value to its C enumerator name; nvJitLinkError
+# uses it to build the exception message.  The table is derived from
+# NvJitLinkResult instead of being spelled out by hand so that no status code
+# (e.g. NVJITLINK_ERROR_THREADPOOL, NVJITLINK_ERROR_UNRECOGNIZED_INPUT,
+# NVJITLINK_ERROR_FINALIZE) can be left out of it.
+cdef dict STATUS = {
+    int(member): f"NVJITLINK_{member.name}"
+    for member in NvJitLinkResult
+}
+
+class nvJitLinkError(Exception):
+    """Raised by ``check_status`` for a non-zero nvJitLink status; the code is kept in ``status``."""
+    def __init__(self, status):
+        self.status = status
+        cdef str err = STATUS.get(status, f"nvJitLink error status {status}")  # no KeyError for codes absent from STATUS
+        super(nvJitLinkError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))  # rebuild from the status code alone
+
+
+@cython.profile(False)
+cdef inline int check_status(int status) except -1 nogil:  # explicit error value; returns 0 when status is 0
+    if status != 0:
+        with gil:  # raising a Python exception needs the GIL
+            raise nvJitLinkError(status)
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):
+    with nogil:
+        status = nvJitLinkCreate(<Handle*>handle, num_options, <const char**>options)
+    check_status(status)
+
+
+cpdef destroy(intptr_t handle):
+    with nogil:
+        status = nvJitLinkDestroy(<Handle*>handle)
+    check_status(status)
+
+
+cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name):
+    with nogil:
+        status = nvJitLinkAddData(<Handle>handle, <_NvJitLinkInputType>input_type, <const void*>data, size, <const char*>name)
+    check_status(status)
+
+
+cpdef add_file(intptr_t handle, int input_type, intptr_t file_name):
+    with nogil:
+        status = nvJitLinkAddFile(<Handle>handle, <_NvJitLinkInputType>input_type, <const char*>file_name)
+    check_status(status)
+
+
+cpdef complete(intptr_t handle):
+    with nogil:
+        status = nvJitLinkComplete(<Handle>handle)
+    check_status(status)
+
+
+cpdef get_linked_cubin_size(intptr_t handle, intptr_t size):
+    with nogil:
+        status = nvJitLinkGetLinkedCubinSize(<Handle>handle, <size_t*>size)
+    check_status(status)
+
+
+cpdef get_linked_cubin(intptr_t handle, intptr_t cubin):
+    with nogil:
+        status = nvJitLinkGetLinkedCubin(<Handle>handle, <void*>cubin)
+    check_status(status)
+
+
+cpdef get_linked_ptx_size(intptr_t handle, intptr_t size):
+    with nogil:
+        status = nvJitLinkGetLinkedPtxSize(<Handle>handle, <size_t*>size)
+    check_status(status)
+
+
+cpdef get_linked_ptx(intptr_t handle, intptr_t ptx):
+    with nogil:
+        status = nvJitLinkGetLinkedPtx(<Handle>handle, <char*>ptx)
+    check_status(status)
+
+
+cpdef get_error_log_size(intptr_t handle, intptr_t size):
+    with nogil:
+        status = nvJitLinkGetErrorLogSize(<Handle>handle, <size_t*>size)
+    check_status(status)
+
+
+cpdef get_error_log(intptr_t handle, intptr_t log):
+    with nogil:
+        status = nvJitLinkGetErrorLog(<Handle>handle, <char*>log)
+    check_status(status)
+
+
+cpdef get_info_log_size(intptr_t handle, intptr_t size):
+    with nogil:
+        status = nvJitLinkGetInfoLogSize(<Handle>handle, <size_t*>size)
+    check_status(status)
+
+
+cpdef get_info_log(intptr_t handle, intptr_t log):
+    with nogil:
+        status = nvJitLinkGetInfoLog(<Handle>handle, <char*>log)
+    check_status(status)
\ No newline at end of file
diff --git a/cuda_bindings/nvJitLink.pxd b/cuda_bindings/nvJitLink.pxd
deleted file mode 100644
index d063002be..000000000
--- a/cuda_bindings/nvJitLink.pxd
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .cynvJitLink cimport *
-
-
-###############################################################################
-# Types
-###############################################################################
-
-
-
-ctypedef cudaStream_t Stream
-ctypedef cudaDataType DataType
-ctypedef libraryPropertyType_t LibraryPropertyType
-
-
-###############################################################################
-# Enum
-###############################################################################
-
-
-
-
-###############################################################################
-# Functions
-###############################################################################
-
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
-cpdef destroy(intptr_t handle)
-cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name)
-cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name)
-cpdef complete(nvJitLinkHandle handle)
-cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin)
-cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx)
-cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_error_log(nvJitLinkHandle handle, intptr_t log)
-cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_info_log(nvJitLinkHandle handle, intptr_t log)
diff --git a/cuda_bindings/nvJitLink.pyx b/cuda_bindings/nvJitLink.pyx
deleted file mode 100644
index 18f4c7545..000000000
--- a/cuda_bindings/nvJitLink.pyx
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-cimport cython  # NOQA
-
-from enum import IntEnum as _IntEnum
-
-
-###############################################################################
-# Enum
-###############################################################################
-
-
-
-
-###############################################################################
-# Error handling
-###############################################################################
-
-cdef dict STATUS={
-    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
-    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
-    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH', // -arch=sm_NN option not specified
-    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
-    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
-    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
-    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL',
-    NVJITLINK_ERROR_THREADPOOL          : 'NVJITLINK_ERROR_THREADPOOL',
-    NVJITLINK_ERROR_UNRECOGNIZED_INPUT  : 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
-    NVJITLINK_ERROR_NULL_INPUT          : 'NVJITLINK_ERROR_NULL_INPUT',
-    NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS: 'NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS',
-    NVJITLINK_ERROR_INCORRECT_INPUT_TYPE: 'NVJITLINK_ERROR_INCORRECT_INPUT_TYPE',
-    NVJITLINK_ERROR_ARCH_MISMATCH       : 'NVJITLINK_ERROR_ARCH_MISMATCH',
-    NVJITLINK_ERROR_OUTDATED_LIBRARY    : 'NVJITLINK_ERROR_OUTDATED_LIBRARY',
-    NVJITLINK_ERROR_MISSING_FATBIN      : 'NVJITLINK_ERROR_MISSING_FATBIN'
-}
-
-class nvJitLinkError(Exception):
-
-    def __init__(self, status):
-        self.status = status
-        cdef str err = STATUS[status]
-        super(nvJitLinkError, self).__init__(err)
-
-    def __reduce__(self):
-        return (type(self), (self.status,))
-
-
-@cython.profile(False)
-cdef inline void check_status(int status) nogil:
-    if status != 0:
-        with gil:
-            raise nvJitLinkError(status)
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):
-    with nogil:
-        status = nvJitLinkCreate(<nvJitLinkHandle*>handle, num_options, <const char**>options)
-        _check_status(status)
-
-
-cpdef destroy(intptr_t handle):
-    with nogil:
-        status = nvJitLinkDestroy(<nvJitLinkHandle*>handle)
-        _check_status(status)
-
-
-cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name):
-    with nogil:
-        status = nvJitLinkAddData(handle, input_type, <const void*>data, size, <const char*>name)
-        _check_status(status)
-
-
-cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name):
-    with nogil:
-        status = nvJitLinkAddFile(handle, input_type, <const char*>file_name)
-        _check_status(status)
-
-
-cpdef complete(nvJitLinkHandle handle):
-    with nogil:
-        status = nvJitLinkComplete(handle)
-        _check_status(status)
-
-
-cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetLinkedCubinSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin):
-    with nogil:
-        status = nvJitLinkGetLinkedCubin(handle, <void*>cubin)
-        _check_status(status)
-
-
-cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetLinkedPtxSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx):
-    with nogil:
-        status = nvJitLinkGetLinkedPtx(handle, <char*>ptx)
-        _check_status(status)
-
-
-cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetErrorLogSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_error_log(nvJitLinkHandle handle, intptr_t log):
-    with nogil:
-        status = nvJitLinkGetErrorLog(handle, <char*>log)
-        _check_status(status)
-
-
-cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetInfoLogSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_info_log(nvJitLinkHandle handle, intptr_t log):
-    with nogil:
-        status = nvJitLinkGetInfoLog(handle, <char*>log)
-        _check_status(status)
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 27b83f946..d7baf018b 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -11,6 +11,7 @@
 import platform
 import sys
 import sysconfig
+import atexit
 
 from Cython import Tempita
 from Cython.Build import cythonize
@@ -19,6 +20,8 @@
 from setuptools.extension import Extension
 from setuptools.command.build_ext import build_ext
 import versioneer
+import tempfile
+import shutil
 
 
 # ----------------------------------------------------------------------
@@ -90,16 +93,13 @@
                 break
         if not os.path.exists(path):
             print(f'Missing header {header}')
-
     print(f'Parsing {library} headers')
     parser = CParser(header_paths,
                      cache='./cache_{}'.format(library.split('.')[0]) if PARSER_CACHING else None,
                      replace=replace)
-
     if library == 'driver':
         CUDA_VERSION = parser.defs['macros']['CUDA_VERSION'] if 'CUDA_VERSION' in parser.defs['macros'] else 'Unknown'
         print(f'Found CUDA_VERSION: {CUDA_VERSION}')
-
     # Combine types with others since they sometimes get tangled
     found_types += {key for key in parser.defs['types']}
     found_types += {key for key in parser.defs['structs']}
@@ -109,16 +109,13 @@
     found_types += {key for key in parser.defs['enums']}
     found_functions += {key for key in parser.defs['functions']}
     found_values += {key for key in parser.defs['values']}
-
 if len(found_functions) == 0:
     raise RuntimeError(f'Parser found no functions. Is CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
-
 # Unwrap struct and union members
 def unwrapMembers(found_dict):
     for key in found_dict:
         members = [var for var, _, _ in found_dict[key]['members']]
         found_dict[key]['members'] = members
-
 unwrapMembers(found_structs)
 unwrapMembers(found_unions)
 
@@ -148,7 +145,9 @@ def generate_output(infile, local):
              os.path.join('cuda', 'bindings'),
              os.path.join('cuda', 'bindings', '_bindings'),
              os.path.join('cuda', 'bindings', '_lib'),
-             os.path.join('cuda', 'bindings', '_lib', 'cyruntime')]
+             os.path.join('cuda', 'bindings', '_lib', 'cyruntime'),
+             os.path.join('cuda', 'bindings', '_internal'),
+            ]
 input_files = []
 for path in path_list:
     input_files += fetch_input_files(path)
@@ -183,6 +182,7 @@ def generate_output(infile, local):
 
 # For Setup
 extensions = []
+new_extensions = []
 cmdclass = {}
 
 # ----------------------------------------------------------------------
@@ -192,6 +192,7 @@ def prep_extensions(sources):
     pattern = sources[0]
     files = glob.glob(pattern)
     exts = []
+    print(include_dirs, library_dirs)
     for pyx in files:
         mod_name = pyx.replace(".pyx", "").replace(os.sep, ".").replace("/", ".")
         exts.append(
@@ -208,6 +209,34 @@ def prep_extensions(sources):
         )
     return exts
 
+# new path for the bindings from cybind
+def rename_architecture_specific_files():
+    if sys.platform == 'linux':
+        src_files = glob.glob('cuda/bindings/_internal/*_linux.pyx')
+    elif sys.platform == 'win32':
+        src_files = glob.glob('cuda/bindings/_internal/*_windows.pyx')
+    else:
+        raise RuntimeError(f'platform is unrecognized: {sys.platform}')
+    dst_files = []
+    for src in src_files:
+        # Set up a temporary file; it must be under the cache directory so
+        # that atomic moves within the same filesystem can be guaranteed
+        with tempfile.NamedTemporaryFile(delete=False, dir='.') as f:
+            shutil.copy2(src, f.name)
+            f_name = f.name
+        dst = src.replace('_linux', '').replace('_windows', '')
+        # atomic move with the destination guaranteed to be overwritten
+        os.replace(f_name, f"./{dst}")
+        dst_files.append(dst)
+
+@atexit.register
+def cleanup_dst_files():
+    pass
+    # for dst in sources_list:
+    #     try:
+    #         os.remove(dst)
+    #     except FileNotFoundError:
+    #         pass
 
 def do_cythonize(extensions):
     return cythonize(
@@ -231,11 +260,20 @@ def do_cythonize(extensions):
     ["cuda/*.pyx"],
     # tests
     ["tests/*.pyx"],
+    # interal files used by cybind
+    ['cuda/bindings/_internal/*.pyx'],
 ]
 
+
+rename_architecture_specific_files()
+
 for sources in sources_list:
     extensions += prep_extensions(sources)
 
+# for sources in new_sources_list:
+#     new_extensions += prep_extensions(sources)
+
+
 # ---------------------------------------------------------------------
 # Custom build_ext command
 # Files are build in two steps:
@@ -258,14 +296,20 @@ def finalize_options(self):
 # ----------------------------------------------------------------------
 # Setup
 
+package_data=dict.fromkeys(
+        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
+        ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
+    )
+
 setup(
     version=versioneer.get_version(),
     ext_modules=do_cythonize(extensions),
-    packages=find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "tests"]),
+    packages=find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
     package_data=dict.fromkeys(
-        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "tests"]),
+        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
         ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
     ),
+    
     cmdclass=cmdclass,
     zip_safe=False,
-)
+)
\ No newline at end of file
diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvjitlink.py
similarity index 62%
rename from cuda_bindings/tests/test_nvJitLink.py
rename to cuda_bindings/tests/test_nvjitlink.py
index f566ae7c6..37129e4a2 100644
--- a/cuda_bindings/tests/test_nvJitLink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -1,44 +1,46 @@
 import pytest
-from cuda import nvJitLink
+from cuda.bindings import nvjitlink
+
+dir(nvjitlink)
 
 def test_create_no_arch_error():
     # nvjitlink expects at least the architecture to be specified.
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
-        nvJitLink.create()
+        nvjitlink.create()
 
 
 def test_invalid_arch_error():
     # sm_XX is not a valid architecture
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-arch=sm_XX")
+        nvjitlink.create("-arch=sm_XX")
 
 
 def test_unrecognized_option_error():
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-fictitious_option")
+        nvjitlink.create("-fictitious_option")
 
 
 def test_invalid_option_type_error():
     with pytest.raises(TypeError, match="Expecting only strings"):
-        nvJitLink.create("-arch", 53)
+        nvjitlink.create("-arch", 53)
 
 
 def test_create_and_destroy():
-    handle = nvJitLink.create("-arch=sm_53")
+    handle = nvjitlink.create("-arch=sm_53")
     assert handle != 0
-    nvJitLink.destroy(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_complete_empty():
-    handle = nvJitLink.create("-arch=sm_75")
-    nvJitLink.complete(handle)
-    nvJitLink.destroy(handle)
+    handle = nvjitlink.create("-arch=sm_75")
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
 
 
 @pytest.mark.parametrize(
     "input_file,input_type",
     [
-        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
+        ("device_functions_cubin", nvjitlink.InputType.CUBIN),
         ("device_functions_fatbin", InputType.FATBIN),
         ("device_functions_ptx", InputType.PTX),
         ("device_functions_object", InputType.OBJECT),
@@ -48,9 +50,9 @@ def test_complete_empty():
 def test_add_file(input_file, input_type, gpu_arch_flag, request):
     filename, data = request.getfixturevalue(input_file)
 
-    handle = nvJitLink.create(gpu_arch_flag)
-    nvJitLink.add_data(handle, input_type.value, data, filename)
-    nvJitLink.destroy(handle)
+    handle = nvjitlink.create(gpu_arch_flag)
+    nvjitlink.add_data(handle, input_type.value, data, filename)
+    nvjitlink.destroy(handle)
 
 
 # We test the LTO input case separately as it requires the `-lto` flag. The
@@ -59,20 +61,20 @@ def test_add_file(input_file, input_type, gpu_arch_flag, request):
 def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
     filename, data = device_functions_ltoir_object
 
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, InputType.OBJECT.value, data, filename)
-    nvJitLink.destroy(handle)
+    handle = nvjitlink.create(gpu_arch_flag, "-lto")
+    nvjitlink.add_data(handle, InputType.OBJECT.value, data, filename)
+    nvjitlink.destroy(handle)
 
 
 def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
+    handle = nvjitlink.create(gpu_arch_flag)
     filename, data = undefined_extern_cubin
     input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
+    nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError):
-        nvJitLink.complete(handle)
-    error_log = nvJitLink.get_error_log(handle)
-    nvJitLink.destroy(handle)
+        nvjitlink.complete(handle)
+    error_log = nvjitlink.get_error_log(handle)
+    nvjitlink.destroy(handle)
     assert (
         "Undefined reference to '_Z5undefff' "
         "in 'undefined_extern.cubin'" in error_log
@@ -80,25 +82,25 @@ def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
 
 
 def test_get_info_log(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
+    handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
     input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    info_log = nvJitLink.get_info_log(handle)
-    nvJitLink.destroy(handle)
+    nvjitlink.add_data(handle, input_type, data, filename)
+    nvjitlink.complete(handle)
+    info_log = nvjitlink.get_info_log(handle)
+    nvjitlink.destroy(handle)
     # Info log is empty
     assert "" == info_log
 
 
 def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
+    handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
     input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
+    nvjitlink.add_data(handle, input_type, data, filename)
+    nvjitlink.complete(handle)
+    cubin = nvjitlink.get_linked_cubin(handle)
+    nvjitlink.destroy(handle)
 
     # Just check we got something that looks like an ELF
     assert cubin[:4] == b"\x7fELF"
@@ -107,13 +109,13 @@ def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
 def test_get_linked_cubin_link_not_complete_error(
     device_functions_cubin, gpu_arch_flag
 ):
-    handle = nvJitLink.create(gpu_arch_flag)
+    handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
     input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
+    nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
+        nvjitlink.get_linked_cubin(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
@@ -123,11 +125,11 @@ def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag)
     # LTO is requested. So we need to use the OBJECT input type, and the linker
     # retrieves the LTO IR from it because we passed the -lto flag.
     input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
+    handle = nvjitlink.create(gpu_arch_flag, "-lto")
+    nvjitlink.add_data(handle, input_type, data, filename)
+    nvjitlink.complete(handle)
+    cubin = nvjitlink.get_linked_cubin(handle)
+    nvjitlink.destroy(handle)
 
     # Just check we got something that looks like an ELF
     assert cubin[:4] == b"\x7fELF"
@@ -140,23 +142,23 @@ def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
     # LTO is requested. So we need to use the OBJECT input type, and the linker
     # retrieves the LTO IR from it because we passed the -lto flag.
     input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
+    handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
+    nvjitlink.add_data(handle, input_type, data, filename)
+    nvjitlink.complete(handle)
+    nvjitlink.get_linked_ptx(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_get_linked_ptx_link_not_complete_error(
     device_functions_ltoir_object, gpu_arch_flag
 ):
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
     filename, data = device_functions_ltoir_object
     input_type = InputType.OBJECT.value
-    nvJitLink.add_data(handle, input_type, data, filename)
+    nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
+        nvjitlink.get_linked_ptx(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_package_version():

From 8c4029f5cf5f7f8f9cdd79eef2b22a19fc2d07cd Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Wed, 16 Oct 2024 16:30:19 -0700
Subject: [PATCH 06/34] working

---
 .../bindings/_internal/nvjitlink_windows.pyx  |  4 +--
 cuda_bindings/cuda/bindings/cynvjitlink.pxd   |  2 +-
 cuda_bindings/cuda/bindings/cynvjitlink.pyx   |  1 +
 cuda_bindings/setup.py                        | 36 +++++++++----------
 4 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 43852441e..5cac180f3 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -6,14 +6,14 @@
 
 from libc.stdint cimport intptr_t
 
-from .utils cimport get_nvjitlink_dso_version_suffix
+from utils cimport get_nvjitlink_dso_version_suffix
 
 import os
 import site
 
 import win32api
 
-from .utils import FunctionNotFoundError, NotSupportedError
+from utils import FunctionNotFoundError, NotSupportedError
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 2913111f0..3dcc1d4ec 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -5,7 +5,7 @@
 # This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 
-from libc.stdint cimport intptr_t, uint32_t
+from libc.stdint cimport uint32_t
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index a6703cc0f..5e882524e 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -5,6 +5,7 @@
 # This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 from ._internal cimport nvjitlink as _nvjitlink
+from libc.stdint cimport uint32_t
 
 
 ###############################################################################
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index d7baf018b..8ffb50d63 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -210,11 +210,11 @@ def prep_extensions(sources):
     return exts
 
 # new path for the bindings from cybind
-def rename_architecture_specific_files():
+def rename_architecture_specific_files(path):
     if sys.platform == 'linux':
-        src_files = glob.glob('cuda/bindings/_internal/*_linux.pyx')
+        src_files = glob.glob(os.path.join(path, '*_linux.pyx'))
     elif sys.platform == 'win32':
-        src_files = glob.glob('cuda/bindings/_internal/*_windows.pyx')
+        src_files = glob.glob(os.path.join(path, '*_windows.pyx'))
     else:
         raise RuntimeError(f'platform is unrecognized: {sys.platform}')
     dst_files = []
@@ -232,11 +232,13 @@ def rename_architecture_specific_files():
 @atexit.register
 def cleanup_dst_files():
     pass
-    # for dst in sources_list:
-    #     try:
-    #         os.remove(dst)
-    #     except FileNotFoundError:
-    #         pass
+    for dst in architechture_specific_files_dir:
+        try:
+            os.remove(dst)
+        except FileNotFoundError:
+            pass
+        
+architechture_specific_files_dir = 'cuda/bindings/_internal/'
 
 def do_cythonize(extensions):
     return cythonize(
@@ -247,6 +249,7 @@ def do_cythonize(extensions):
         ),
         **extra_cythonize_kwargs)
 
+rename_architecture_specific_files(architechture_specific_files_dir)
 
 sources_list = [
     # private
@@ -260,20 +263,18 @@ def do_cythonize(extensions):
     ["cuda/*.pyx"],
     # tests
     ["tests/*.pyx"],
-    # interal files used by cybind
-    ['cuda/bindings/_internal/*.pyx'],
+
+    # interal files used by cybind. We on
+    ['cuda/bindings/_internal/nvjitlink.pyx'],
+    ['cuda/bindings/_internal/utils.pyx'],
+
 ]
 
 
-rename_architecture_specific_files()
 
 for sources in sources_list:
     extensions += prep_extensions(sources)
 
-# for sources in new_sources_list:
-#     new_extensions += prep_extensions(sources)
-
-
 # ---------------------------------------------------------------------
 # Custom build_ext command
 # Files are build in two steps:
@@ -296,11 +297,6 @@ def finalize_options(self):
 # ----------------------------------------------------------------------
 # Setup
 
-package_data=dict.fromkeys(
-        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
-        ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
-    )
-
 setup(
     version=versioneer.get_version(),
     ext_modules=do_cythonize(extensions),

From 8852a9252441fe101e8e754643387aa1358b004a Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 15 Oct 2024 12:47:51 -0700
Subject: [PATCH 07/34] rebase

---
 .../cuda/bindings/_bindings/nvJitLink.pxd     |  26 ++
 .../bindings/_bindings/nvJitLink_linux.pyx    | 382 +++++++++++++++++
 .../bindings/_bindings/nvJitLink_windows.pyx  | 393 ++++++++++++++++++
 cuda_bindings/cynvJitLink.pxd                 |  48 +++
 cuda_bindings/cynvJitLink.pyx                 |  63 +++
 cuda_bindings/nvJitLink.pxd                   |  46 ++
 cuda_bindings/nvJitLink.pyx                   | 138 ++++++
 cuda_bindings/tests/test_nvJitLink.py         |   3 +
 8 files changed, 1099 insertions(+)
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
 create mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
 create mode 100644 cuda_bindings/cynvJitLink.pxd
 create mode 100644 cuda_bindings/cynvJitLink.pyx
 create mode 100644 cuda_bindings/nvJitLink.pxd
 create mode 100644 cuda_bindings/nvJitLink.pyx
 create mode 100644 cuda_bindings/tests/test_nvJitLink.py

diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
new file mode 100644
index 000000000..dca128a0e
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ..cynvJitLink cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
new file mode 100644
index 000000000..2fc6ca625
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
@@ -0,0 +1,382 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+    # Mode flags accepted by dlopen(); the values come from the C header.
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+    # Pseudo-handle: dlsym(RTLD_DEFAULT, name) looks the symbol up among libraries already loaded in the process.
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_nvJitLink_init = False  # set to True once _check_or_init_nvJitLink() runs to completion
+cdef void* __cuDriverGetVersion = NULL  # cached address of cuDriverGetVersion from libcuda.so.1
+# One slot per nvJitLink entry point; NULL until resolved by _check_or_init_nvJitLink().
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:
+    # dlopen() each candidate soname in order and return the first handle obtained.
+    cdef void* lib
+    for ver in get_nvJitLink_dso_version_suffix(driver_ver):
+        soname = "libnvJitLink.so" + (f".{ver}" if ver else ver)
+        lib = dlopen(soname.encode(), RTLD_NOW | RTLD_GLOBAL)
+        if lib != NULL:
+            return lib
+    # No candidate could be opened: surface the loader's diagnostic to the caller.
+    reason = dlerror()
+    raise RuntimeError(f'Failed to dlopen libnvJitLink ({reason.decode()})')
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:
+    # Resolve every nvJitLink entry point once and cache the addresses in the module-level slots.
+    # Returns 0; raises NotSupportedError/RuntimeError if the CUDA driver cannot be loaded or queried.
+    global __py_nvJitLink_init, __cuDriverGetVersion
+    if __py_nvJitLink_init:
+        return 0
+    # Load driver to check version
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    if __cuDriverGetVersion == NULL:
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:
+        with gil:
+            raise RuntimeError('cuDriverGetVersion is not found in libcuda.so.1')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        with gil:
+            raise RuntimeError(f'cuDriverGetVersion failed with error code {err}')
+    # NOTE(review): the libcuda handle is never dlclose()d (the call was left disabled) - confirm intent.
+    handle = NULL  # reused below as the lazily loaded libnvJitLink handle
+
+    # Load function: prefer a symbol already visible in the process, else dlopen libnvJitLink once.
+    global __nvJitLinkCreate
+    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')
+    if __nvJitLinkCreate == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')
+
+    global __nvJitLinkDestroy
+    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
+    if __nvJitLinkDestroy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
+
+    global __nvJitLinkAddData
+    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
+    if __nvJitLinkAddData == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
+
+    global __nvJitLinkAddFile
+    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
+    if __nvJitLinkAddFile == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
+
+    global __nvJitLinkComplete
+    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
+    if __nvJitLinkComplete == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
+
+    global __nvJitLinkGetLinkedCubinSize
+    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
+    if __nvJitLinkGetLinkedCubinSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
+
+    global __nvJitLinkGetLinkedCubin
+    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
+    if __nvJitLinkGetLinkedCubin == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
+
+    global __nvJitLinkGetLinkedPtxSize
+    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
+    if __nvJitLinkGetLinkedPtxSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
+
+    global __nvJitLinkGetLinkedPtx
+    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
+    if __nvJitLinkGetLinkedPtx == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
+
+    global __nvJitLinkGetErrorLogSize
+    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
+    if __nvJitLinkGetErrorLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
+
+    global __nvJitLinkGetErrorLog
+    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
+    if __nvJitLinkGetErrorLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
+
+    global __nvJitLinkGetInfoLogSize
+    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
+    if __nvJitLinkGetInfoLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
+
+    global __nvJitLinkGetInfoLog
+    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
+    if __nvJitLinkGetInfoLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
+
+    __py_nvJitLink_init = True
+    return 0
+
+
+cdef dict func_ptrs = None  # memoized result of _inspect_function_pointers(); None until first built
+
+
+cpdef dict _inspect_function_pointers():  # -> {"__nvJitLink<Name>": address as int}
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs  # already built on an earlier call
+    # First call: make sure the symbols are resolved, then snapshot every slot.
+    _check_or_init_nvJitLink()
+    cdef dict data = {}
+    # A slot that is still NULL (symbol not found) is reported as 0.
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # cache so later calls return the same dict
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    """Return the address (as an int) stored under slot key ``name``; KeyError if the key is unknown."""
+    # _inspect_function_pointers() builds and memoizes the table in func_ptrs on first use.
+    cdef dict table = _inspect_function_pointers()
+    return table[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkCreate through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+            handle, numOptions, options)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkDestroy through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkAddData through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+            handle, inputType, data, size, name)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkAddFile through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+            handle, inputType, fileName)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkComplete through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedCubinSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedCubin through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+            handle, cubin)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedPtxSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedPtx through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+            handle, ptx)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetErrorLogSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetErrorLog through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetInfoLogSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetInfoLog through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
new file mode 100644
index 000000000..8856b59ca
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
@@ -0,0 +1,393 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvJitLink_dso_version_suffix
+
+import os
+import site
+
+import win32api
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800  # LoadLibraryEx flag; used below when loading nvcuda.dll
+LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000  # LoadLibraryEx flags; OR-ed together below when loading
+LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100  # a pip-installed nvJitLink DLL by absolute path
+cdef bint __py_nvJitLink_init = False  # set to True once _check_or_init_nvJitLink() runs to completion
+cdef void* __cuDriverGetVersion = NULL  # cached address of cuDriverGetVersion from nvcuda.dll
+# One slot per nvJitLink entry point; NULL until resolved by _check_or_init_nvJitLink().
+cdef void* __nvJitLinkCreate = NULL
+cdef void* __nvJitLinkDestroy = NULL
+cdef void* __nvJitLinkAddData = NULL
+cdef void* __nvJitLinkAddFile = NULL
+cdef void* __nvJitLinkComplete = NULL
+cdef void* __nvJitLinkGetLinkedCubinSize = NULL
+cdef void* __nvJitLinkGetLinkedCubin = NULL
+cdef void* __nvJitLinkGetLinkedPtxSize = NULL
+cdef void* __nvJitLinkGetLinkedPtx = NULL
+cdef void* __nvJitLinkGetErrorLogSize = NULL
+cdef void* __nvJitLinkGetErrorLog = NULL
+cdef void* __nvJitLinkGetInfoLogSize = NULL
+cdef void* __nvJitLinkGetInfoLog = NULL
+
+
+cdef inline list get_site_packages():  # user site-packages first, then the global ones
+    return [site.getusersitepackages(), *site.getsitepackages()]
+
+
+cdef load_library(const int driver_ver):
+    """Return a module handle for nvJitLink64_<suffix>.dll, loading the DLL if needed.
+
+    For each non-empty suffix from get_nvJitLink_dso_version_suffix(driver_ver) the
+    lookup order is: a copy already loaded in the process, a pip-installed copy in
+    <site-packages>/nvidia/nvJitLink/bin, then the default DLL search.
+
+    Raises:
+        RuntimeError: if no candidate DLL could be located or loaded.
+    """
+    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
+        if len(suffix) == 0:
+            continue
+        dll_name = f"nvJitLink64_{suffix}.dll"
+
+        # First check if the DLL has been loaded by 3rd parties
+        try:
+            return win32api.GetModuleHandle(dll_name)
+        except Exception:
+            pass
+
+        # Next, check if DLLs are installed via pip.  Attempt the load from every
+        # existing directory rather than only from the last site-packages entry.
+        for sp in get_site_packages():
+            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
+            if not os.path.isdir(mod_path):
+                continue
+            os.add_dll_directory(mod_path)
+            try:
+                return win32api.LoadLibraryEx(
+                    # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
+                    os.path.join(mod_path, dll_name),
+                    0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
+            except Exception:
+                pass
+
+        # Finally, try default search
+        try:
+            return win32api.LoadLibrary(dll_name)
+        except Exception:
+            pass
+
+    # Every candidate failed (or only empty suffixes were offered).
+    raise RuntimeError('Failed to load nvJitLink')
+
+
+cdef int _check_or_init_nvJitLink() except -1 nogil:
+    # Resolve the nvJitLink entry points once (under the GIL) and cache them in the module-level slots.
+    # Returns 0; raises NotSupportedError/RuntimeError if the driver or the nvJitLink DLL cannot be loaded.
+    global __py_nvJitLink_init, __cuDriverGetVersion
+    if __py_nvJitLink_init:
+        return 0
+    cdef int err, driver_ver
+    with gil:
+        # Load driver to check version
+        try:
+            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
+        except Exception as e:
+            raise NotSupportedError(f'CUDA driver is not found ({e})')
+        if __cuDriverGetVersion == NULL:
+            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
+            if __cuDriverGetVersion == NULL:
+                raise RuntimeError('cuDriverGetVersion is not found in nvcuda.dll')
+        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+        if err != 0:
+            raise RuntimeError(f'cuDriverGetVersion failed with error code {err}')
+
+        # Load library
+        handle = load_library(driver_ver)
+
+        # Load function.  A failed lookup leaves the slot NULL; the wrapper then raises FunctionNotFoundError.
+        global __nvJitLinkCreate
+        try:
+            __nvJitLinkCreate = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkCreate')
+        except Exception:
+            pass
+
+        global __nvJitLinkDestroy
+        try:
+            __nvJitLinkDestroy = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkDestroy')
+        except Exception:
+            pass
+
+        global __nvJitLinkAddData
+        try:
+            __nvJitLinkAddData = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddData')
+        except Exception:
+            pass
+
+        global __nvJitLinkAddFile
+        try:
+            __nvJitLinkAddFile = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddFile')
+        except Exception:
+            pass
+
+        global __nvJitLinkComplete
+        try:
+            __nvJitLinkComplete = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkComplete')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedCubinSize
+        try:
+            __nvJitLinkGetLinkedCubinSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedCubin
+        try:
+            __nvJitLinkGetLinkedCubin = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedPtxSize
+        try:
+            __nvJitLinkGetLinkedPtxSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetLinkedPtx
+        try:
+            __nvJitLinkGetLinkedPtx = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetErrorLogSize
+        try:
+            __nvJitLinkGetErrorLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetErrorLog
+        try:
+            __nvJitLinkGetErrorLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLog')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetInfoLogSize
+        try:
+            __nvJitLinkGetInfoLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
+        except Exception:
+            pass
+
+        global __nvJitLinkGetInfoLog
+        try:
+            __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
+        except Exception:
+            pass
+
+    __py_nvJitLink_init = True
+    return 0
+
+
+cdef dict func_ptrs = None  # memoized result of _inspect_function_pointers(); None until first built
+
+
+cpdef dict _inspect_function_pointers():  # -> {"__nvJitLink<Name>": address as int}
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs  # already built on an earlier call
+    # First call: make sure the symbols are resolved, then snapshot every slot.
+    _check_or_init_nvJitLink()
+    cdef dict data = {}
+    # A slot that is still NULL (lookup failed) is reported as 0.
+    global __nvJitLinkCreate
+    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
+    
+    global __nvJitLinkDestroy
+    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
+    
+    global __nvJitLinkAddData
+    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
+    
+    global __nvJitLinkAddFile
+    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
+    
+    global __nvJitLinkComplete
+    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
+    
+    global __nvJitLinkGetLinkedCubinSize
+    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
+    
+    global __nvJitLinkGetLinkedCubin
+    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
+    
+    global __nvJitLinkGetLinkedPtxSize
+    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
+    
+    global __nvJitLinkGetLinkedPtx
+    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
+    
+    global __nvJitLinkGetErrorLogSize
+    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
+    
+    global __nvJitLinkGetErrorLog
+    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
+    
+    global __nvJitLinkGetInfoLogSize
+    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
+    
+    global __nvJitLinkGetInfoLog
+    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+
+    func_ptrs = data  # cache so later calls return the same dict
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    """Return the address (as an int) stored under slot key ``name``; KeyError if the key is unknown."""
+    # _inspect_function_pointers() builds and memoizes the table in func_ptrs on first use.
+    cdef dict table = _inspect_function_pointers()
+    return table[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkCreate through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkCreate != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
+            handle, numOptions, options)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkCreate is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkDestroy through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkDestroy != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkAddData through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddData != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
+            handle, inputType, data, size, name)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddData is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkAddFile through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkAddFile != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
+            handle, inputType, fileName)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkComplete through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkComplete != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
+            handle)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkComplete is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedCubinSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubinSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedCubin through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedCubin != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
+            handle, cubin)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedPtxSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtxSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetLinkedPtx through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetLinkedPtx != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
+            handle, ptx)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetErrorLogSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetErrorLog through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetErrorLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetInfoLogSize through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLogSize != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
+            handle, size)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
+
+
+cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    # Initialize the bindings on first use, then call nvJitLinkGetInfoLog through its cached pointer.
+    _check_or_init_nvJitLink()
+    if __nvJitLinkGetInfoLog != NULL:
+        return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
+            handle, log)
+    with gil:
+        raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
diff --git a/cuda_bindings/cynvJitLink.pxd b/cuda_bindings/cynvJitLink.pxd
new file mode 100644
index 000000000..ed440c0b3
--- /dev/null
+++ b/cuda_bindings/cynvJitLink.pxd
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+
+from libc.stdint cimport int64_t
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums
+# NOTE(review): this file declares no enums, yet the prototypes below use nvJitLinkResult and
+# nvJitLinkInputType (plus nvJitLinkHandle and uint32_t, which are likewise not declared or
+# cimported here) - confirm where these types are meant to come from.
+# types
+cdef extern from *:
+    """
+    #include <driver_types.h>
+    #include <library_types.h>
+    #include <cuComplex.h>
+    """
+    ctypedef void* cudaStream_t 'cudaStream_t'  # NOTE(review): not used by any prototype in this file - confirm it is needed
+
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+# Prototypes of the public C-level API; cynvJitLink.pyx in this patch defines them as forwarders to the _nvJitLink bindings.
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cynvJitLink.pyx b/cuda_bindings/cynvJitLink.pyx
new file mode 100644
index 000000000..65d3f9840
--- /dev/null
+++ b/cuda_bindings/cynvJitLink.pyx
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from ._internal cimport nvJitLink as _nvJitLink
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+# Every wrapper below passes its arguments through unchanged to the underscore-prefixed function of the same name in ._internal.nvJitLink.
+cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
+    return _nvJitLink._nvJitLinkCreate(handle, numOptions, options)
+
+
+cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
+    return _nvJitLink._nvJitLinkDestroy(handle)
+
+
+cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
+    return _nvJitLink._nvJitLinkAddData(handle, inputType, data, size, name)
+
+
+cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
+    return _nvJitLink._nvJitLinkAddFile(handle, inputType, fileName)
+
+
+cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
+    return _nvJitLink._nvJitLinkComplete(handle)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedCubinSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedCubin(handle, cubin)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedPtxSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
+    return _nvJitLink._nvJitLinkGetLinkedPtx(handle, ptx)
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetErrorLogSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
+    return _nvJitLink._nvJitLinkGetErrorLog(handle, log)
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
+    return _nvJitLink._nvJitLinkGetInfoLogSize(handle, size)
+
+
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
+    return _nvJitLink._nvJitLinkGetInfoLog(handle, log)
diff --git a/cuda_bindings/nvJitLink.pxd b/cuda_bindings/nvJitLink.pxd
new file mode 100644
index 000000000..d063002be
--- /dev/null
+++ b/cuda_bindings/nvJitLink.pxd
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .cynvJitLink cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+
+
+ctypedef cudaStream_t Stream  # short alias; NOTE(review): cudaStream_t is expected to arrive via the `.cynvJitLink` cimport above
+ctypedef cudaDataType DataType  # NOTE(review): no declaration of cudaDataType is visible in this part of the patch — confirm it resolves
+ctypedef libraryPropertyType_t LibraryPropertyType  # NOTE(review): same concern for libraryPropertyType_t
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Functions
+###############################################################################
+# Python-callable entry points; intptr_t parameters carry raw addresses that nvJitLink.pyx casts to C pointers.
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
+cpdef destroy(intptr_t handle)
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name)
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name)
+cpdef complete(nvJitLinkHandle handle)
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin)
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx)
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log)
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size)
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log)
diff --git a/cuda_bindings/nvJitLink.pyx b/cuda_bindings/nvJitLink.pyx
new file mode 100644
index 000000000..18f4c7545
--- /dev/null
+++ b/cuda_bindings/nvJitLink.pyx
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from enum import IntEnum as _IntEnum
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+cdef dict STATUS={  # result code -> its symbolic name; nvJitLinkError reads its message from here
+    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
+    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
+    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH',  # -arch=sm_NN option not specified
+    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
+    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
+    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
+    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL',
+    NVJITLINK_ERROR_THREADPOOL          : 'NVJITLINK_ERROR_THREADPOOL',
+    NVJITLINK_ERROR_UNRECOGNIZED_INPUT  : 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
+    NVJITLINK_ERROR_NULL_INPUT          : 'NVJITLINK_ERROR_NULL_INPUT',
+    NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS: 'NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS',
+    NVJITLINK_ERROR_INCORRECT_INPUT_TYPE: 'NVJITLINK_ERROR_INCORRECT_INPUT_TYPE',
+    NVJITLINK_ERROR_ARCH_MISMATCH       : 'NVJITLINK_ERROR_ARCH_MISMATCH',
+    NVJITLINK_ERROR_OUTDATED_LIBRARY    : 'NVJITLINK_ERROR_OUTDATED_LIBRARY',
+    NVJITLINK_ERROR_MISSING_FATBIN      : 'NVJITLINK_ERROR_MISSING_FATBIN'
+}
+
+class nvJitLinkError(Exception):
+    """Exception built from an nvJitLink status code; the message is that code's name in STATUS."""
+    def __init__(self, status):
+        self.status = status  # kept so __reduce__ can rebuild the exception
+        cdef str err = STATUS.get(status, f'unknown nvJitLink status ({status})')  # no KeyError for codes missing from the table
+        super(nvJitLinkError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))  # pickle support: re-call the constructor with the stored status
+
+
+@cython.profile(False)
+cdef inline void _check_status(int status) except* nogil:  # name now matches the `_check_status(...)` call sites; explicit except* so the raise propagates
+    if status != 0:  # any non-zero status is turned into an exception
+        with gil:  # raising a Python exception requires holding the GIL
+            raise nvJitLinkError(status)
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):  # handle and options are addresses passed as integers
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkCreate(<nvJitLinkHandle*>handle, num_options, <const char**>options)  # integers reinterpreted as C pointers
+        _check_status(status)
+
+
+cpdef destroy(intptr_t handle):  # handle is the address of an nvJitLinkHandle, not the handle value itself
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkDestroy(<nvJitLinkHandle*>handle)
+        _check_status(status)
+
+
+cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name):  # data and name are addresses passed as integers
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkAddData(handle, input_type, <const void*>data, size, <const char*>name)  # data -> const void*, name -> const char*
+        _check_status(status)
+
+
+cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name):  # file_name is the address of a C string
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkAddFile(handle, input_type, <const char*>file_name)
+        _check_status(status)
+
+
+cpdef complete(nvJitLinkHandle handle):  # thin call-through to nvJitLinkComplete
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkComplete(handle)
+        _check_status(status)
+
+
+cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size):  # size is an address, cast to size_t* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetLinkedCubinSize(handle, <size_t*>size)  # presumably an out-parameter filled by the library — confirm
+        _check_status(status)
+
+
+cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin):  # cubin is an address, cast to void* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetLinkedCubin(handle, <void*>cubin)  # presumably a caller-allocated output buffer — confirm sizing with the caller
+        _check_status(status)
+
+
+cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size):  # size is an address, cast to size_t* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetLinkedPtxSize(handle, <size_t*>size)  # presumably an out-parameter filled by the library — confirm
+        _check_status(status)
+
+
+cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx):  # ptx is an address, cast to char* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetLinkedPtx(handle, <char*>ptx)  # presumably a caller-allocated output buffer — confirm sizing with the caller
+        _check_status(status)
+
+
+cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size):  # size is an address, cast to size_t* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetErrorLogSize(handle, <size_t*>size)  # presumably an out-parameter filled by the library — confirm
+        _check_status(status)
+
+
+cpdef get_error_log(nvJitLinkHandle handle, intptr_t log):  # log is an address, cast to char* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetErrorLog(handle, <char*>log)  # presumably a caller-allocated output buffer — confirm sizing with the caller
+        _check_status(status)
+
+
+cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size):  # size is an address, cast to size_t* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetInfoLogSize(handle, <size_t*>size)  # presumably an out-parameter filled by the library — confirm
+        _check_status(status)
+
+
+cpdef get_info_log(nvJitLinkHandle handle, intptr_t log):  # log is an address, cast to char* below
+    with nogil:  # the C call runs with the GIL released
+        status = nvJitLinkGetInfoLog(handle, <char*>log)  # presumably a caller-allocated output buffer — confirm sizing with the caller
+        _check_status(status)
diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
new file mode 100644
index 000000000..7ced5ff38
--- /dev/null
+++ b/cuda_bindings/tests/test_nvJitLink.py
@@ -0,0 +1,3 @@
+import pytest
+from cuda import nvJitLink
+

From fb1198a371ec5f361666715f5a3acce9ef159533 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 15 Oct 2024 10:11:07 -0700
Subject: [PATCH 08/34]  add test file

---
 cuda_bindings/tests/test_nvJitLink.py | 161 ++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
index 7ced5ff38..f566ae7c6 100644
--- a/cuda_bindings/tests/test_nvJitLink.py
+++ b/cuda_bindings/tests/test_nvJitLink.py
@@ -1,3 +1,164 @@
 import pytest
 from cuda import nvJitLink
 
+def test_create_no_arch_error():  # create() with no options must raise RuntimeError matching the pattern below
+    # nvjitlink expects at least the architecture to be specified.
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
+        nvJitLink.create()
+
+
+def test_invalid_arch_error():  # a malformed -arch value is expected to be reported as an unrecognized option
+    # sm_XX is not a valid architecture
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
+        nvJitLink.create("-arch=sm_XX")
+
+
+def test_unrecognized_option_error():  # an option name that does not exist must raise RuntimeError matching the pattern below
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
+        nvJitLink.create("-fictitious_option")
+
+
+def test_invalid_option_type_error():  # a non-string option (the int 53) is expected to raise TypeError
+    with pytest.raises(TypeError, match="Expecting only strings"):
+        nvJitLink.create("-arch", 53)
+
+
+def test_create_and_destroy():  # NOTE(review): nvJitLink.pxd in this patch declares create(handle, num_options, options) with integer args — confirm this string-option call style
+    handle = nvJitLink.create("-arch=sm_53")
+    assert handle != 0  # a zero handle is treated as failure here
+    nvJitLink.destroy(handle)
+
+
+def test_complete_empty():  # complete() with no inputs added is expected not to raise
+    handle = nvJitLink.create("-arch=sm_75")
+    nvJitLink.complete(handle)
+    nvJitLink.destroy(handle)
+
+
+@pytest.mark.parametrize(
+    "input_file,input_type",
+    [
+        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
+        ("device_functions_fatbin", nvJitLink.InputType.FATBIN),
+        ("device_functions_ptx", nvJitLink.InputType.PTX),
+        ("device_functions_object", nvJitLink.InputType.OBJECT),
+        ("device_functions_archive", nvJitLink.InputType.LIBRARY),
+    ],
+)
+def test_add_file(input_file, input_type, gpu_arch_flag, request):  # one case per input kind; passes if add_data does not raise
+    filename, data = request.getfixturevalue(input_file)  # input_file is a fixture name, resolved at run time
+
+    handle = nvJitLink.create(gpu_arch_flag)
+    nvJitLink.add_data(handle, input_type.value, data, filename)
+    nvJitLink.destroy(handle)
+
+
+# We test the LTO input case separately as it requires the `-lto` flag. The
+# OBJECT input type is used because the LTO-IR container is packaged in an ELF
+# object when produced by NVCC.
+def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):  # passes if add_data accepts the LTO object without raising
+    filename, data = device_functions_ltoir_object
+
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, nvJitLink.InputType.OBJECT.value, data, filename)  # InputType qualified: it is not imported on its own
+    nvJitLink.destroy(handle)
+
+
+def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):  # a failed link must leave the undefined-symbol message in the error log
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = undefined_extern_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType qualified: it is not imported on its own
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError):
+        nvJitLink.complete(handle)
+    error_log = nvJitLink.get_error_log(handle)  # read the log before the handle is destroyed
+    nvJitLink.destroy(handle)
+    assert (
+        "Undefined reference to '_Z5undefff' "
+        "in 'undefined_extern.cubin'" in error_log
+    )
+
+
+def test_get_info_log(device_functions_cubin, gpu_arch_flag):  # a successful link is expected to produce an empty info log
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType qualified: it is not imported on its own
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    info_log = nvJitLink.get_info_log(handle)
+    nvJitLink.destroy(handle)
+    # Info log is empty
+    assert "" == info_log
+
+
+def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):  # link one cubin and check the output starts with the ELF magic
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType qualified: it is not imported on its own
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_cubin_link_not_complete_error(  # fetching the cubin before complete() is expected to raise
+    device_functions_cubin, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag)
+    filename, data = device_functions_cubin
+    input_type = nvJitLink.InputType.CUBIN.value  # InputType qualified: it is not imported on its own
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
+        nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):  # LTO link, then check the output starts with the ELF magic
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType qualified: it is not imported on its own
+    handle = nvJitLink.create(gpu_arch_flag, "-lto")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    cubin = nvJitLink.get_linked_cubin(handle)
+    nvJitLink.destroy(handle)
+
+    # Just check we got something that looks like an ELF
+    assert cubin[:4] == b"\x7fELF"
+
+
+def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):  # passes if PTX retrieval after an LTO link does not raise
+    filename, data = device_functions_ltoir_object
+    # device_functions_ltoir_object is a host object containing a fatbin
+    # containing an LTOIR container, because that is what NVCC produces when
+    # LTO is requested. So we need to use the OBJECT input type, and the linker
+    # retrieves the LTO IR from it because we passed the -lto flag.
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType qualified: it is not imported on its own
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    nvJitLink.add_data(handle, input_type, data, filename)
+    nvJitLink.complete(handle)
+    nvJitLink.get_linked_ptx(handle)  # return value is not inspected
+    nvJitLink.destroy(handle)
+
+
+def test_get_linked_ptx_link_not_complete_error(  # fetching the PTX before complete() is expected to raise
+    device_functions_ltoir_object, gpu_arch_flag
+):
+    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
+    filename, data = device_functions_ltoir_object
+    input_type = nvJitLink.InputType.OBJECT.value  # InputType qualified: it is not imported on its own
+    nvJitLink.add_data(handle, input_type, data, filename)
+    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
+        nvJitLink.get_linked_ptx(handle)
+    nvJitLink.destroy(handle)
+
+
+def test_package_version():  # NOTE(review): `pynvjitlink` is never imported here (only pytest and nvJitLink are), so this raises NameError as written
+    assert pynvjitlink.__version__ is not None
+    assert len(str(pynvjitlink.__version__)) > 0  # TODO: confirm which module's __version__ this should check
\ No newline at end of file

From 2e4955f7a2ca7ebb4b67a99f397b78a6950fdcfb Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Wed, 16 Oct 2024 16:46:51 -0700
Subject: [PATCH 09/34] rebase

---
 .../cuda/bindings/_bindings/nvJitLink.pxd     |  26 --
 .../bindings/_bindings/nvJitLink_linux.pyx    | 382 -----------------
 .../bindings/_bindings/nvJitLink_windows.pyx  | 393 ------------------
 .../bindings/_internal/nvjitlink_windows.pyx  |   8 +
 cuda_bindings/cuda/bindings/cynvjitlink.pxd   |   4 +
 cuda_bindings/cuda/bindings/cynvjitlink.pyx   |   3 +
 cuda_bindings/cynvJitLink.pxd                 |  48 ---
 cuda_bindings/cynvJitLink.pyx                 |  63 ---
 cuda_bindings/nvJitLink.pxd                   |  46 --
 cuda_bindings/nvJitLink.pyx                   | 138 ------
 cuda_bindings/setup.py                        |  14 +-
 cuda_bindings/tests/test_nvJitLink.py         | 164 --------
 12 files changed, 27 insertions(+), 1262 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
 delete mode 100644 cuda_bindings/cynvJitLink.pxd
 delete mode 100644 cuda_bindings/cynvJitLink.pyx
 delete mode 100644 cuda_bindings/nvJitLink.pxd
 delete mode 100644 cuda_bindings/nvJitLink.pyx
 delete mode 100644 cuda_bindings/tests/test_nvJitLink.py

diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
deleted file mode 100644
index dca128a0e..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from ..cynvJitLink cimport *
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
deleted file mode 100644
index 2fc6ca625..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
+++ /dev/null
@@ -1,382 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .utils cimport get_nvJitLink_dso_version_suffix
-
-from .utils import FunctionNotFoundError, NotSupportedError
-
-
-###############################################################################
-# Extern
-###############################################################################
-
-cdef extern from "<dlfcn.h>" nogil:
-    void* dlopen(const char*, int)
-    char* dlerror()
-    void* dlsym(void*, const char*)
-    int dlclose(void*)
-
-    enum:
-        RTLD_LAZY
-        RTLD_NOW
-        RTLD_GLOBAL
-        RTLD_LOCAL
-
-    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
-
-
-###############################################################################
-# Wrapper init
-###############################################################################
-
-cdef bint __py_nvJitLink_init = False
-cdef void* __cuDriverGetVersion = NULL
-
-cdef void* __nvJitLinkCreate = NULL
-cdef void* __nvJitLinkDestroy = NULL
-cdef void* __nvJitLinkAddData = NULL
-cdef void* __nvJitLinkAddFile = NULL
-cdef void* __nvJitLinkComplete = NULL
-cdef void* __nvJitLinkGetLinkedCubinSize = NULL
-cdef void* __nvJitLinkGetLinkedCubin = NULL
-cdef void* __nvJitLinkGetLinkedPtxSize = NULL
-cdef void* __nvJitLinkGetLinkedPtx = NULL
-cdef void* __nvJitLinkGetErrorLogSize = NULL
-cdef void* __nvJitLinkGetErrorLog = NULL
-cdef void* __nvJitLinkGetInfoLogSize = NULL
-cdef void* __nvJitLinkGetInfoLog = NULL
-
-
-cdef void* load_library(const int driver_ver) except* with gil:
-    cdef void* handle
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
-        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)
-        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
-        if handle != NULL:
-            break
-    else:
-        err_msg = dlerror()
-        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
-    return handle
-
-
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
-        return 0
-
-    # Load driver to check version
-    cdef void* handle = NULL
-    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
-    if handle == NULL:
-        with gil:
-            err_msg = dlerror()
-            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
-    global __cuDriverGetVersion
-    if __cuDriverGetVersion == NULL:
-        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
-    if __cuDriverGetVersion == NULL:
-        with gil:
-            raise RuntimeError('something went wrong')
-    cdef int err, driver_ver
-    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
-    if err != 0:
-        with gil:
-            raise RuntimeError('something went wrong')
-    #dlclose(handle)
-    handle = NULL
-
-    # Load function
-    global __nvJitLinkCreate
-    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')
-    if __nvJitLinkCreate == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')
-    
-    global __nvJitLinkDestroy
-    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
-    if __nvJitLinkDestroy == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
-    
-    global __nvJitLinkAddData
-    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
-    if __nvJitLinkAddData == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
-    
-    global __nvJitLinkAddFile
-    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
-    if __nvJitLinkAddFile == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
-    
-    global __nvJitLinkComplete
-    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
-    if __nvJitLinkComplete == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
-    
-    global __nvJitLinkGetLinkedCubinSize
-    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
-    
-    global __nvJitLinkGetLinkedCubin
-    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
-    if __nvJitLinkGetLinkedCubin == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
-    
-    global __nvJitLinkGetLinkedPtxSize
-    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
-    
-    global __nvJitLinkGetLinkedPtx
-    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
-    if __nvJitLinkGetLinkedPtx == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
-    
-    global __nvJitLinkGetErrorLogSize
-    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
-    if __nvJitLinkGetErrorLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
-    
-    global __nvJitLinkGetErrorLog
-    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
-    if __nvJitLinkGetErrorLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
-    
-    global __nvJitLinkGetInfoLogSize
-    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
-    if __nvJitLinkGetInfoLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
-    
-    global __nvJitLinkGetInfoLog
-    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
-    if __nvJitLinkGetInfoLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
-
-    __py_nvJitLink_init = True
-    return 0
-
-
-cdef dict func_ptrs = None
-
-
-cpdef dict _inspect_function_pointers():
-    global func_ptrs
-    if func_ptrs is not None:
-        return func_ptrs
-
-    _check_or_init_nvJitLink()
-    cdef dict data = {}
-
-    global __nvJitLinkCreate
-    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
-    
-    global __nvJitLinkDestroy
-    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
-    
-    global __nvJitLinkAddData
-    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
-    
-    global __nvJitLinkAddFile
-    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
-    
-    global __nvJitLinkComplete
-    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
-    
-    global __nvJitLinkGetLinkedCubinSize
-    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
-    
-    global __nvJitLinkGetLinkedCubin
-    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
-    
-    global __nvJitLinkGetLinkedPtxSize
-    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
-    
-    global __nvJitLinkGetLinkedPtx
-    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
-    
-    global __nvJitLinkGetErrorLogSize
-    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
-    
-    global __nvJitLinkGetErrorLog
-    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
-    
-    global __nvJitLinkGetInfoLogSize
-    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
-    
-    global __nvJitLinkGetInfoLog
-    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
-
-    func_ptrs = data
-    return data
-
-
-cpdef _inspect_function_pointer(str name):
-    global func_ptrs
-    if func_ptrs is None:
-        func_ptrs = _inspect_function_pointers()
-    return func_ptrs[name]
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
-    if __nvJitLinkCreate == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
-        handle, numOptions, options)
-
-
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
-    if __nvJitLinkDestroy == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddData == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
-        handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddFile == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
-        handle, inputType, fileName)
-
-
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
-    if __nvJitLinkComplete == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubin == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
-        handle, cubin)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtx == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
-        handle, ptx)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
-        handle, log)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
-        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
deleted file mode 100644
index 8856b59ca..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
+++ /dev/null
@@ -1,393 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .utils cimport get_nvJitLink_dso_version_suffix
-
-import os
-import site
-
-import win32api
-
-from .utils import FunctionNotFoundError, NotSupportedError
-
-
-###############################################################################
-# Wrapper init
-###############################################################################
-
-LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800
-LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
-LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
-cdef bint __py_nvJitLink_init = False
-cdef void* __cuDriverGetVersion = NULL
-
-cdef void* __nvJitLinkCreate = NULL
-cdef void* __nvJitLinkDestroy = NULL
-cdef void* __nvJitLinkAddData = NULL
-cdef void* __nvJitLinkAddFile = NULL
-cdef void* __nvJitLinkComplete = NULL
-cdef void* __nvJitLinkGetLinkedCubinSize = NULL
-cdef void* __nvJitLinkGetLinkedCubin = NULL
-cdef void* __nvJitLinkGetLinkedPtxSize = NULL
-cdef void* __nvJitLinkGetLinkedPtx = NULL
-cdef void* __nvJitLinkGetErrorLogSize = NULL
-cdef void* __nvJitLinkGetErrorLog = NULL
-cdef void* __nvJitLinkGetInfoLogSize = NULL
-cdef void* __nvJitLinkGetInfoLog = NULL
-
-
-cdef inline list get_site_packages():
-    return [site.getusersitepackages()] + site.getsitepackages()
-
-
-cdef load_library(const int driver_ver):
-    handle = 0
-
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
-        if len(suffix) == 0:
-            continue
-        dll_name = f"nvJitLink64_{suffix}.dll"
-
-        # First check if the DLL has been loaded by 3rd parties
-        try:
-            handle = win32api.GetModuleHandle(dll_name)
-        except:
-            pass
-        else:
-            break
-
-        # Next, check if DLLs are installed via pip
-        for sp in get_site_packages():
-            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
-            if not os.path.isdir(mod_path):
-                continue
-            os.add_dll_directory(mod_path)
-        try:
-            handle = win32api.LoadLibraryEx(
-                # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
-                os.path.join(mod_path, dll_name),
-                0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
-        except:
-            pass
-        else:
-            break
-
-        # Finally, try default search
-        try:
-            handle = win32api.LoadLibrary(dll_name)
-        except:
-            pass
-        else:
-            break
-    else:
-        raise RuntimeError('Failed to load nvJitLink')
-
-    assert handle != 0
-    return handle
-
-
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
-        return 0
-
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-            if __cuDriverGetVersion == NULL:
-                raise RuntimeError('something went wrong')
-        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
-        if err != 0:
-            raise RuntimeError('something went wrong')
-
-        # Load library
-        handle = load_library(driver_ver)
-
-        # Load function
-        global __nvJitLinkCreate
-        try:
-            __nvJitLinkCreate = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkCreate')
-        except:
-            pass
-    
-        global __nvJitLinkDestroy
-        try:
-            __nvJitLinkDestroy = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkDestroy')
-        except:
-            pass
-    
-        global __nvJitLinkAddData
-        try:
-            __nvJitLinkAddData = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddData')
-        except:
-            pass
-    
-        global __nvJitLinkAddFile
-        try:
-            __nvJitLinkAddFile = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddFile')
-        except:
-            pass
-    
-        global __nvJitLinkComplete
-        try:
-            __nvJitLinkComplete = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkComplete')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedCubinSize
-        try:
-            __nvJitLinkGetLinkedCubinSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedCubin
-        try:
-            __nvJitLinkGetLinkedCubin = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedPtxSize
-        try:
-            __nvJitLinkGetLinkedPtxSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedPtx
-        try:
-            __nvJitLinkGetLinkedPtx = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
-        except:
-            pass
-    
-        global __nvJitLinkGetErrorLogSize
-        try:
-            __nvJitLinkGetErrorLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetErrorLog
-        try:
-            __nvJitLinkGetErrorLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLog')
-        except:
-            pass
-    
-        global __nvJitLinkGetInfoLogSize
-        try:
-            __nvJitLinkGetInfoLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetInfoLog
-        try:
-            __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
-        except:
-            pass
-
-    __py_nvJitLink_init = True
-    return 0
-
-
-cdef dict func_ptrs = None
-
-
-cpdef dict _inspect_function_pointers():
-    global func_ptrs
-    if func_ptrs is not None:
-        return func_ptrs
-
-    _check_or_init_nvJitLink()
-    cdef dict data = {}
-
-    global __nvJitLinkCreate
-    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
-    
-    global __nvJitLinkDestroy
-    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
-    
-    global __nvJitLinkAddData
-    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
-    
-    global __nvJitLinkAddFile
-    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
-    
-    global __nvJitLinkComplete
-    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
-    
-    global __nvJitLinkGetLinkedCubinSize
-    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
-    
-    global __nvJitLinkGetLinkedCubin
-    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
-    
-    global __nvJitLinkGetLinkedPtxSize
-    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
-    
-    global __nvJitLinkGetLinkedPtx
-    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
-    
-    global __nvJitLinkGetErrorLogSize
-    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
-    
-    global __nvJitLinkGetErrorLog
-    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
-    
-    global __nvJitLinkGetInfoLogSize
-    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
-    
-    global __nvJitLinkGetInfoLog
-    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
-
-    func_ptrs = data
-    return data
-
-
-cpdef _inspect_function_pointer(str name):
-    global func_ptrs
-    if func_ptrs is None:
-        func_ptrs = _inspect_function_pointers()
-    return func_ptrs[name]
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
-    if __nvJitLinkCreate == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
-        handle, numOptions, options)
-
-
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
-    if __nvJitLinkDestroy == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddData == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
-        handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddFile == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
-        handle, inputType, fileName)
-
-
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
-    if __nvJitLinkComplete == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubin == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
-        handle, cubin)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtx == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
-        handle, ptx)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
-        handle, log)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
-        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 5cac180f3..19ea34ee1 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -6,14 +6,22 @@
 
 from libc.stdint cimport intptr_t
 
+<<<<<<< HEAD
 from utils cimport get_nvjitlink_dso_version_suffix
+=======
+from .utils cimport get_nvjitlink_dso_version_suffix
+>>>>>>> 5d60eb1 (more changes)
 
 import os
 import site
 
 import win32api
 
+<<<<<<< HEAD
 from utils import FunctionNotFoundError, NotSupportedError
+=======
+from .utils import FunctionNotFoundError, NotSupportedError
+>>>>>>> 5d60eb1 (more changes)
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 3dcc1d4ec..250153ece 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -5,7 +5,11 @@
 # This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 
+<<<<<<< HEAD
 from libc.stdint cimport uint32_t
+=======
+from libc.stdint cimport intptr_t, uint32_t
+>>>>>>> 5d60eb1 (more changes)
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index 5e882524e..d4acbd606 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -5,7 +5,10 @@
 # This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 from ._internal cimport nvjitlink as _nvjitlink
+<<<<<<< HEAD
 from libc.stdint cimport uint32_t
+=======
+>>>>>>> 5d60eb1 (more changes)
 
 
 ###############################################################################
diff --git a/cuda_bindings/cynvJitLink.pxd b/cuda_bindings/cynvJitLink.pxd
deleted file mode 100644
index ed440c0b3..000000000
--- a/cuda_bindings/cynvJitLink.pxd
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-
-from libc.stdint cimport int64_t
-
-
-###############################################################################
-# Types (structs, enums, ...)
-###############################################################################
-
-# enums
-
-
-
-# types
-cdef extern from *:
-    """
-    #include <driver_types.h>
-    #include <library_types.h>
-    #include <cuComplex.h>
-    """
-    ctypedef void* cudaStream_t 'cudaStream_t'
-
-
-
-
-
-###############################################################################
-# Functions
-###############################################################################
-
-cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
-cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
-cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
-cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
-cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
-cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
-cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
-cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
-cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cynvJitLink.pyx b/cuda_bindings/cynvJitLink.pyx
deleted file mode 100644
index 65d3f9840..000000000
--- a/cuda_bindings/cynvJitLink.pyx
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from ._internal cimport nvJitLink as _nvJitLink
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    return _nvJitLink._nvJitLinkCreate(handle, numOptions, options)
-
-
-cdef nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    return _nvJitLink._nvJitLinkDestroy(handle)
-
-
-cdef nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    return _nvJitLink._nvJitLinkAddData(handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    return _nvJitLink._nvJitLinkAddFile(handle, inputType, fileName)
-
-
-cdef nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    return _nvJitLink._nvJitLinkComplete(handle)
-
-
-cdef nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedCubinSize(handle, size)
-
-
-cdef nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedCubin(handle, cubin)
-
-
-cdef nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedPtxSize(handle, size)
-
-
-cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    return _nvJitLink._nvJitLinkGetLinkedPtx(handle, ptx)
-
-
-cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetErrorLogSize(handle, size)
-
-
-cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    return _nvJitLink._nvJitLinkGetErrorLog(handle, log)
-
-
-cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    return _nvJitLink._nvJitLinkGetInfoLogSize(handle, size)
-
-
-cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    return _nvJitLink._nvJitLinkGetInfoLog(handle, log)
diff --git a/cuda_bindings/nvJitLink.pxd b/cuda_bindings/nvJitLink.pxd
deleted file mode 100644
index d063002be..000000000
--- a/cuda_bindings/nvJitLink.pxd
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .cynvJitLink cimport *
-
-
-###############################################################################
-# Types
-###############################################################################
-
-
-
-ctypedef cudaStream_t Stream
-ctypedef cudaDataType DataType
-ctypedef libraryPropertyType_t LibraryPropertyType
-
-
-###############################################################################
-# Enum
-###############################################################################
-
-
-
-
-###############################################################################
-# Functions
-###############################################################################
-
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
-cpdef destroy(intptr_t handle)
-cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name)
-cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name)
-cpdef complete(nvJitLinkHandle handle)
-cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin)
-cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx)
-cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_error_log(nvJitLinkHandle handle, intptr_t log)
-cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size)
-cpdef get_info_log(nvJitLinkHandle handle, intptr_t log)
diff --git a/cuda_bindings/nvJitLink.pyx b/cuda_bindings/nvJitLink.pyx
deleted file mode 100644
index 18f4c7545..000000000
--- a/cuda_bindings/nvJitLink.pyx
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-cimport cython  # NOQA
-
-from enum import IntEnum as _IntEnum
-
-
-###############################################################################
-# Enum
-###############################################################################
-
-
-
-
-###############################################################################
-# Error handling
-###############################################################################
-
-cdef dict STATUS={
-    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
-    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
-    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH', // -arch=sm_NN option not specified
-    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
-    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
-    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
-    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL',
-    NVJITLINK_ERROR_THREADPOOL          : 'NVJITLINK_ERROR_THREADPOOL',
-    NVJITLINK_ERROR_UNRECOGNIZED_INPUT  : 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
-    NVJITLINK_ERROR_NULL_INPUT          : 'NVJITLINK_ERROR_NULL_INPUT',
-    NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS: 'NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS',
-    NVJITLINK_ERROR_INCORRECT_INPUT_TYPE: 'NVJITLINK_ERROR_INCORRECT_INPUT_TYPE',
-    NVJITLINK_ERROR_ARCH_MISMATCH       : 'NVJITLINK_ERROR_ARCH_MISMATCH',
-    NVJITLINK_ERROR_OUTDATED_LIBRARY    : 'NVJITLINK_ERROR_OUTDATED_LIBRARY',
-    NVJITLINK_ERROR_MISSING_FATBIN      : 'NVJITLINK_ERROR_MISSING_FATBIN'
-}
-
-class nvJitLinkError(Exception):
-
-    def __init__(self, status):
-        self.status = status
-        cdef str err = STATUS[status]
-        super(nvJitLinkError, self).__init__(err)
-
-    def __reduce__(self):
-        return (type(self), (self.status,))
-
-
-@cython.profile(False)
-cdef inline void check_status(int status) nogil:
-    if status != 0:
-        with gil:
-            raise nvJitLinkError(status)
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):
-    with nogil:
-        status = nvJitLinkCreate(<nvJitLinkHandle*>handle, num_options, <const char**>options)
-        _check_status(status)
-
-
-cpdef destroy(intptr_t handle):
-    with nogil:
-        status = nvJitLinkDestroy(<nvJitLinkHandle*>handle)
-        _check_status(status)
-
-
-cpdef add_data(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t data, size_t size, intptr_t name):
-    with nogil:
-        status = nvJitLinkAddData(handle, input_type, <const void*>data, size, <const char*>name)
-        _check_status(status)
-
-
-cpdef add_file(nvJitLinkHandle handle, nvJitLinkInputType input_type, intptr_t file_name):
-    with nogil:
-        status = nvJitLinkAddFile(handle, input_type, <const char*>file_name)
-        _check_status(status)
-
-
-cpdef complete(nvJitLinkHandle handle):
-    with nogil:
-        status = nvJitLinkComplete(handle)
-        _check_status(status)
-
-
-cpdef get_linked_cubin_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetLinkedCubinSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_linked_cubin(nvJitLinkHandle handle, intptr_t cubin):
-    with nogil:
-        status = nvJitLinkGetLinkedCubin(handle, <void*>cubin)
-        _check_status(status)
-
-
-cpdef get_linked_ptx_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetLinkedPtxSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_linked_ptx(nvJitLinkHandle handle, intptr_t ptx):
-    with nogil:
-        status = nvJitLinkGetLinkedPtx(handle, <char*>ptx)
-        _check_status(status)
-
-
-cpdef get_error_log_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetErrorLogSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_error_log(nvJitLinkHandle handle, intptr_t log):
-    with nogil:
-        status = nvJitLinkGetErrorLog(handle, <char*>log)
-        _check_status(status)
-
-
-cpdef get_info_log_size(nvJitLinkHandle handle, intptr_t size):
-    with nogil:
-        status = nvJitLinkGetInfoLogSize(handle, <size_t*>size)
-        _check_status(status)
-
-
-cpdef get_info_log(nvJitLinkHandle handle, intptr_t log):
-    with nogil:
-        status = nvJitLinkGetInfoLog(handle, <char*>log)
-        _check_status(status)
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 8ffb50d63..4bfc57f19 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -231,7 +231,6 @@ def rename_architecture_specific_files(path):
 
 @atexit.register
 def cleanup_dst_files():
-    pass
     for dst in architechture_specific_files_dir:
         try:
             os.remove(dst)
@@ -240,6 +239,7 @@ def cleanup_dst_files():
         
 architechture_specific_files_dir = 'cuda/bindings/_internal/'
 
+
 def do_cythonize(extensions):
     return cythonize(
         extensions,
@@ -267,14 +267,19 @@ def do_cythonize(extensions):
     # interal files used by cybind. We on
     ['cuda/bindings/_internal/nvjitlink.pyx'],
     ['cuda/bindings/_internal/utils.pyx'],
-
 ]
 
 
 
+rename_architecture_specific_files()
+
 for sources in sources_list:
     extensions += prep_extensions(sources)
 
+# for sources in new_sources_list:
+#     new_extensions += prep_extensions(sources)
+
+
 # ---------------------------------------------------------------------
 # Custom build_ext command
 # Files are build in two steps:
@@ -297,6 +302,11 @@ def finalize_options(self):
 # ----------------------------------------------------------------------
 # Setup
 
+package_data=dict.fromkeys(
+        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
+        ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
+    )
+
 setup(
     version=versioneer.get_version(),
     ext_modules=do_cythonize(extensions),
diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
deleted file mode 100644
index f566ae7c6..000000000
--- a/cuda_bindings/tests/test_nvJitLink.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import pytest
-from cuda import nvJitLink
-
-def test_create_no_arch_error():
-    # nvjitlink expects at least the architecture to be specified.
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
-        nvJitLink.create()
-
-
-def test_invalid_arch_error():
-    # sm_XX is not a valid architecture
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-arch=sm_XX")
-
-
-def test_unrecognized_option_error():
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-fictitious_option")
-
-
-def test_invalid_option_type_error():
-    with pytest.raises(TypeError, match="Expecting only strings"):
-        nvJitLink.create("-arch", 53)
-
-
-def test_create_and_destroy():
-    handle = nvJitLink.create("-arch=sm_53")
-    assert handle != 0
-    nvJitLink.destroy(handle)
-
-
-def test_complete_empty():
-    handle = nvJitLink.create("-arch=sm_75")
-    nvJitLink.complete(handle)
-    nvJitLink.destroy(handle)
-
-
-@pytest.mark.parametrize(
-    "input_file,input_type",
-    [
-        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
-        ("device_functions_fatbin", InputType.FATBIN),
-        ("device_functions_ptx", InputType.PTX),
-        ("device_functions_object", InputType.OBJECT),
-        ("device_functions_archive", InputType.LIBRARY),
-    ],
-)
-def test_add_file(input_file, input_type, gpu_arch_flag, request):
-    filename, data = request.getfixturevalue(input_file)
-
-    handle = nvJitLink.create(gpu_arch_flag)
-    nvJitLink.add_data(handle, input_type.value, data, filename)
-    nvJitLink.destroy(handle)
-
-
-# We test the LTO input case separately as it requires the `-lto` flag. The
-# OBJECT input type is used because the LTO-IR container is packaged in an ELF
-# object when produced by NVCC.
-def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, InputType.OBJECT.value, data, filename)
-    nvJitLink.destroy(handle)
-
-
-def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = undefined_extern_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError):
-        nvJitLink.complete(handle)
-    error_log = nvJitLink.get_error_log(handle)
-    nvJitLink.destroy(handle)
-    assert (
-        "Undefined reference to '_Z5undefff' "
-        "in 'undefined_extern.cubin'" in error_log
-    )
-
-
-def test_get_info_log(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    info_log = nvJitLink.get_info_log(handle)
-    nvJitLink.destroy(handle)
-    # Info log is empty
-    assert "" == info_log
-
-
-def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
-
-
-def test_get_linked_cubin_link_not_complete_error(
-    device_functions_cubin, gpu_arch_flag
-):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
-
-
-def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_get_linked_ptx_link_not_complete_error(
-    device_functions_ltoir_object, gpu_arch_flag
-):
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
-    filename, data = device_functions_ltoir_object
-    input_type = InputType.OBJECT.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_package_version():
-    assert pynvjitlink.__version__ is not None
-    assert len(str(pynvjitlink.__version__)) > 0
\ No newline at end of file

From 238736c881cffeadaad0a0e3552fdb1ced18c596 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Wed, 16 Oct 2024 16:58:53 -0700
Subject: [PATCH 10/34] rebase merge

---
 .../cuda/bindings/_internal/nvjitlink_windows.pyx  |  4 ++--
 cuda_bindings/cuda/bindings/cynvjitlink.pxd        |  4 ++++
 cuda_bindings/cuda/bindings/cynvjitlink.pyx        |  4 ++++
 cuda_bindings/setup.py                             | 14 --------------
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 43852441e..5cac180f3 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -6,14 +6,14 @@
 
 from libc.stdint cimport intptr_t
 
-from .utils cimport get_nvjitlink_dso_version_suffix
+from utils cimport get_nvjitlink_dso_version_suffix
 
 import os
 import site
 
 import win32api
 
-from .utils import FunctionNotFoundError, NotSupportedError
+from utils import FunctionNotFoundError, NotSupportedError
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 250153ece..3f4134706 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -5,11 +5,15 @@
 # This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
 
 
+<<<<<<< HEAD
 <<<<<<< HEAD
 from libc.stdint cimport uint32_t
 =======
 from libc.stdint cimport intptr_t, uint32_t
 >>>>>>> 5d60eb1 (more changes)
+=======
+from libc.stdint cimport uint32_t
+>>>>>>> 8c4029f (working)
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index d4acbd606..c91948f03 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -6,9 +6,13 @@
 
 from ._internal cimport nvjitlink as _nvjitlink
 <<<<<<< HEAD
+<<<<<<< HEAD
 from libc.stdint cimport uint32_t
 =======
 >>>>>>> 5d60eb1 (more changes)
+=======
+from libc.stdint cimport uint32_t
+>>>>>>> 8c4029f (working)
 
 
 ###############################################################################
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 4bfc57f19..f0aaee771 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -239,7 +239,6 @@ def cleanup_dst_files():
         
 architechture_specific_files_dir = 'cuda/bindings/_internal/'
 
-
 def do_cythonize(extensions):
     return cythonize(
         extensions,
@@ -269,17 +268,9 @@ def do_cythonize(extensions):
     ['cuda/bindings/_internal/utils.pyx'],
 ]
 
-
-
-rename_architecture_specific_files()
-
 for sources in sources_list:
     extensions += prep_extensions(sources)
 
-# for sources in new_sources_list:
-#     new_extensions += prep_extensions(sources)
-
-
 # ---------------------------------------------------------------------
 # Custom build_ext command
 # Files are build in two steps:
@@ -302,11 +293,6 @@ def finalize_options(self):
 # ----------------------------------------------------------------------
 # Setup
 
-package_data=dict.fromkeys(
-        find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
-        ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
-    )
-
 setup(
     version=versioneer.get_version(),
     ext_modules=do_cythonize(extensions),

From d4bd29c550ca35fcf5674ae0595ac3b44e6e767e Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Wed, 16 Oct 2024 17:10:00 -0700
Subject: [PATCH 11/34] cleaning up merge

---
 .../cuda/bindings/_bindings/nvJitLink.pxd     |  26 --
 .../bindings/_bindings/nvJitLink_linux.pyx    | 382 -----------------
 .../bindings/_bindings/nvJitLink_windows.pyx  | 393 ------------------
 .../cuda/bindings/_internal/nvjitlink.pyx     | 382 -----------------
 cuda_bindings/setup.py                        |   8 +-
 cuda_bindings/tests/test_nvJitLink.py         | 164 --------
 6 files changed, 7 insertions(+), 1348 deletions(-)
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
 delete mode 100644 cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
 delete mode 100644 cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
 delete mode 100644 cuda_bindings/tests/test_nvJitLink.py

diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd b/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
deleted file mode 100644
index dca128a0e..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink.pxd
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from ..cynvJitLink cimport *
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
deleted file mode 100644
index 2fc6ca625..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_linux.pyx
+++ /dev/null
@@ -1,382 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .utils cimport get_nvJitLink_dso_version_suffix
-
-from .utils import FunctionNotFoundError, NotSupportedError
-
-
-###############################################################################
-# Extern
-###############################################################################
-
-cdef extern from "<dlfcn.h>" nogil:
-    void* dlopen(const char*, int)
-    char* dlerror()
-    void* dlsym(void*, const char*)
-    int dlclose(void*)
-
-    enum:
-        RTLD_LAZY
-        RTLD_NOW
-        RTLD_GLOBAL
-        RTLD_LOCAL
-
-    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
-
-
-###############################################################################
-# Wrapper init
-###############################################################################
-
-cdef bint __py_nvJitLink_init = False
-cdef void* __cuDriverGetVersion = NULL
-
-cdef void* __nvJitLinkCreate = NULL
-cdef void* __nvJitLinkDestroy = NULL
-cdef void* __nvJitLinkAddData = NULL
-cdef void* __nvJitLinkAddFile = NULL
-cdef void* __nvJitLinkComplete = NULL
-cdef void* __nvJitLinkGetLinkedCubinSize = NULL
-cdef void* __nvJitLinkGetLinkedCubin = NULL
-cdef void* __nvJitLinkGetLinkedPtxSize = NULL
-cdef void* __nvJitLinkGetLinkedPtx = NULL
-cdef void* __nvJitLinkGetErrorLogSize = NULL
-cdef void* __nvJitLinkGetErrorLog = NULL
-cdef void* __nvJitLinkGetInfoLogSize = NULL
-cdef void* __nvJitLinkGetInfoLog = NULL
-
-
-cdef void* load_library(const int driver_ver) except* with gil:
-    cdef void* handle
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
-        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)
-        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
-        if handle != NULL:
-            break
-    else:
-        err_msg = dlerror()
-        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
-    return handle
-
-
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
-        return 0
-
-    # Load driver to check version
-    cdef void* handle = NULL
-    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
-    if handle == NULL:
-        with gil:
-            err_msg = dlerror()
-            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
-    global __cuDriverGetVersion
-    if __cuDriverGetVersion == NULL:
-        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
-    if __cuDriverGetVersion == NULL:
-        with gil:
-            raise RuntimeError('something went wrong')
-    cdef int err, driver_ver
-    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
-    if err != 0:
-        with gil:
-            raise RuntimeError('something went wrong')
-    #dlclose(handle)
-    handle = NULL
-
-    # Load function
-    global __nvJitLinkCreate
-    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')
-    if __nvJitLinkCreate == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')
-    
-    global __nvJitLinkDestroy
-    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
-    if __nvJitLinkDestroy == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
-    
-    global __nvJitLinkAddData
-    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
-    if __nvJitLinkAddData == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
-    
-    global __nvJitLinkAddFile
-    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
-    if __nvJitLinkAddFile == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
-    
-    global __nvJitLinkComplete
-    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
-    if __nvJitLinkComplete == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
-    
-    global __nvJitLinkGetLinkedCubinSize
-    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
-    
-    global __nvJitLinkGetLinkedCubin
-    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
-    if __nvJitLinkGetLinkedCubin == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
-    
-    global __nvJitLinkGetLinkedPtxSize
-    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
-    
-    global __nvJitLinkGetLinkedPtx
-    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
-    if __nvJitLinkGetLinkedPtx == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
-    
-    global __nvJitLinkGetErrorLogSize
-    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
-    if __nvJitLinkGetErrorLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
-    
-    global __nvJitLinkGetErrorLog
-    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
-    if __nvJitLinkGetErrorLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
-    
-    global __nvJitLinkGetInfoLogSize
-    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
-    if __nvJitLinkGetInfoLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
-    
-    global __nvJitLinkGetInfoLog
-    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
-    if __nvJitLinkGetInfoLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
-
-    __py_nvJitLink_init = True
-    return 0
-
-
-cdef dict func_ptrs = None
-
-
-cpdef dict _inspect_function_pointers():
-    global func_ptrs
-    if func_ptrs is not None:
-        return func_ptrs
-
-    _check_or_init_nvJitLink()
-    cdef dict data = {}
-
-    global __nvJitLinkCreate
-    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
-    
-    global __nvJitLinkDestroy
-    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
-    
-    global __nvJitLinkAddData
-    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
-    
-    global __nvJitLinkAddFile
-    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
-    
-    global __nvJitLinkComplete
-    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
-    
-    global __nvJitLinkGetLinkedCubinSize
-    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
-    
-    global __nvJitLinkGetLinkedCubin
-    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
-    
-    global __nvJitLinkGetLinkedPtxSize
-    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
-    
-    global __nvJitLinkGetLinkedPtx
-    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
-    
-    global __nvJitLinkGetErrorLogSize
-    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
-    
-    global __nvJitLinkGetErrorLog
-    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
-    
-    global __nvJitLinkGetInfoLogSize
-    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
-    
-    global __nvJitLinkGetInfoLog
-    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
-
-    func_ptrs = data
-    return data
-
-
-cpdef _inspect_function_pointer(str name):
-    global func_ptrs
-    if func_ptrs is None:
-        func_ptrs = _inspect_function_pointers()
-    return func_ptrs[name]
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
-    if __nvJitLinkCreate == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
-        handle, numOptions, options)
-
-
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
-    if __nvJitLinkDestroy == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddData == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
-        handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddFile == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
-        handle, inputType, fileName)
-
-
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
-    if __nvJitLinkComplete == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubin == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
-        handle, cubin)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtx == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
-        handle, ptx)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
-        handle, log)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
-        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx b/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
deleted file mode 100644
index 8856b59ca..000000000
--- a/cuda_bindings/cuda/bindings/_bindings/nvJitLink_windows.pyx
+++ /dev/null
@@ -1,393 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.1 to 12.4.1. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .utils cimport get_nvJitLink_dso_version_suffix
-
-import os
-import site
-
-import win32api
-
-from .utils import FunctionNotFoundError, NotSupportedError
-
-
-###############################################################################
-# Wrapper init
-###############################################################################
-
-LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800
-LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
-LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
-cdef bint __py_nvJitLink_init = False
-cdef void* __cuDriverGetVersion = NULL
-
-cdef void* __nvJitLinkCreate = NULL
-cdef void* __nvJitLinkDestroy = NULL
-cdef void* __nvJitLinkAddData = NULL
-cdef void* __nvJitLinkAddFile = NULL
-cdef void* __nvJitLinkComplete = NULL
-cdef void* __nvJitLinkGetLinkedCubinSize = NULL
-cdef void* __nvJitLinkGetLinkedCubin = NULL
-cdef void* __nvJitLinkGetLinkedPtxSize = NULL
-cdef void* __nvJitLinkGetLinkedPtx = NULL
-cdef void* __nvJitLinkGetErrorLogSize = NULL
-cdef void* __nvJitLinkGetErrorLog = NULL
-cdef void* __nvJitLinkGetInfoLogSize = NULL
-cdef void* __nvJitLinkGetInfoLog = NULL
-
-
-cdef inline list get_site_packages():
-    return [site.getusersitepackages()] + site.getsitepackages()
-
-
-cdef load_library(const int driver_ver):
-    handle = 0
-
-    for suffix in get_nvJitLink_dso_version_suffix(driver_ver):
-        if len(suffix) == 0:
-            continue
-        dll_name = f"nvJitLink64_{suffix}.dll"
-
-        # First check if the DLL has been loaded by 3rd parties
-        try:
-            handle = win32api.GetModuleHandle(dll_name)
-        except:
-            pass
-        else:
-            break
-
-        # Next, check if DLLs are installed via pip
-        for sp in get_site_packages():
-            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
-            if not os.path.isdir(mod_path):
-                continue
-            os.add_dll_directory(mod_path)
-        try:
-            handle = win32api.LoadLibraryEx(
-                # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
-                os.path.join(mod_path, dll_name),
-                0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
-        except:
-            pass
-        else:
-            break
-
-        # Finally, try default search
-        try:
-            handle = win32api.LoadLibrary(dll_name)
-        except:
-            pass
-        else:
-            break
-    else:
-        raise RuntimeError('Failed to load nvJitLink')
-
-    assert handle != 0
-    return handle
-
-
-cdef int _check_or_init_nvJitLink() except -1 nogil:
-    global __py_nvJitLink_init
-    if __py_nvJitLink_init:
-        return 0
-
-    cdef int err, driver_ver
-    with gil:
-        # Load driver to check version
-        try:
-            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
-        except Exception as e:
-            raise NotSupportedError(f'CUDA driver is not found ({e})')
-        global __cuDriverGetVersion
-        if __cuDriverGetVersion == NULL:
-            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
-            if __cuDriverGetVersion == NULL:
-                raise RuntimeError('something went wrong')
-        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
-        if err != 0:
-            raise RuntimeError('something went wrong')
-
-        # Load library
-        handle = load_library(driver_ver)
-
-        # Load function
-        global __nvJitLinkCreate
-        try:
-            __nvJitLinkCreate = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkCreate')
-        except:
-            pass
-    
-        global __nvJitLinkDestroy
-        try:
-            __nvJitLinkDestroy = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkDestroy')
-        except:
-            pass
-    
-        global __nvJitLinkAddData
-        try:
-            __nvJitLinkAddData = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddData')
-        except:
-            pass
-    
-        global __nvJitLinkAddFile
-        try:
-            __nvJitLinkAddFile = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkAddFile')
-        except:
-            pass
-    
-        global __nvJitLinkComplete
-        try:
-            __nvJitLinkComplete = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkComplete')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedCubinSize
-        try:
-            __nvJitLinkGetLinkedCubinSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedCubin
-        try:
-            __nvJitLinkGetLinkedCubin = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedPtxSize
-        try:
-            __nvJitLinkGetLinkedPtxSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetLinkedPtx
-        try:
-            __nvJitLinkGetLinkedPtx = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
-        except:
-            pass
-    
-        global __nvJitLinkGetErrorLogSize
-        try:
-            __nvJitLinkGetErrorLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetErrorLog
-        try:
-            __nvJitLinkGetErrorLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetErrorLog')
-        except:
-            pass
-    
-        global __nvJitLinkGetInfoLogSize
-        try:
-            __nvJitLinkGetInfoLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
-        except:
-            pass
-    
-        global __nvJitLinkGetInfoLog
-        try:
-            __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
-        except:
-            pass
-
-    __py_nvJitLink_init = True
-    return 0
-
-
-cdef dict func_ptrs = None
-
-
-cpdef dict _inspect_function_pointers():
-    global func_ptrs
-    if func_ptrs is not None:
-        return func_ptrs
-
-    _check_or_init_nvJitLink()
-    cdef dict data = {}
-
-    global __nvJitLinkCreate
-    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
-    
-    global __nvJitLinkDestroy
-    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
-    
-    global __nvJitLinkAddData
-    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
-    
-    global __nvJitLinkAddFile
-    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
-    
-    global __nvJitLinkComplete
-    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
-    
-    global __nvJitLinkGetLinkedCubinSize
-    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
-    
-    global __nvJitLinkGetLinkedCubin
-    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
-    
-    global __nvJitLinkGetLinkedPtxSize
-    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
-    
-    global __nvJitLinkGetLinkedPtx
-    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
-    
-    global __nvJitLinkGetErrorLogSize
-    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
-    
-    global __nvJitLinkGetErrorLog
-    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
-    
-    global __nvJitLinkGetInfoLogSize
-    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
-    
-    global __nvJitLinkGetInfoLog
-    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
-
-    func_ptrs = data
-    return data
-
-
-cpdef _inspect_function_pointer(str name):
-    global func_ptrs
-    if func_ptrs is None:
-        func_ptrs = _inspect_function_pointers()
-    return func_ptrs[name]
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    global __nvJitLinkCreate
-    _check_or_init_nvJitLink()
-    if __nvJitLinkCreate == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
-        handle, numOptions, options)
-
-
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    global __nvJitLinkDestroy
-    _check_or_init_nvJitLink()
-    if __nvJitLinkDestroy == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    global __nvJitLinkAddData
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddData == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
-        handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    global __nvJitLinkAddFile
-    _check_or_init_nvJitLink()
-    if __nvJitLinkAddFile == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
-        handle, inputType, fileName)
-
-
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    global __nvJitLinkComplete
-    _check_or_init_nvJitLink()
-    if __nvJitLinkComplete == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedCubin == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
-        handle, cubin)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetLinkedPtx == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
-        handle, ptx)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetErrorLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetErrorLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
-        handle, log)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetInfoLog
-    _check_or_init_nvJitLink()
-    if __nvJitLinkGetInfoLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
-        handle, log)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
deleted file mode 100644
index ff7a6ca3a..000000000
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
+++ /dev/null
@@ -1,382 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
-
-from libc.stdint cimport intptr_t
-
-from .utils cimport get_nvjitlink_dso_version_suffix
-
-from .utils import FunctionNotFoundError, NotSupportedError
-
-
-###############################################################################
-# Extern
-###############################################################################
-
-cdef extern from "<dlfcn.h>" nogil:
-    void* dlopen(const char*, int)
-    char* dlerror()
-    void* dlsym(void*, const char*)
-    int dlclose(void*)
-
-    enum:
-        RTLD_LAZY
-        RTLD_NOW
-        RTLD_GLOBAL
-        RTLD_LOCAL
-
-    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
-
-
-###############################################################################
-# Wrapper init
-###############################################################################
-
-cdef bint __py_nvjitlink_init = False
-cdef void* __cuDriverGetVersion = NULL
-
-cdef void* __nvJitLinkCreate = NULL
-cdef void* __nvJitLinkDestroy = NULL
-cdef void* __nvJitLinkAddData = NULL
-cdef void* __nvJitLinkAddFile = NULL
-cdef void* __nvJitLinkComplete = NULL
-cdef void* __nvJitLinkGetLinkedCubinSize = NULL
-cdef void* __nvJitLinkGetLinkedCubin = NULL
-cdef void* __nvJitLinkGetLinkedPtxSize = NULL
-cdef void* __nvJitLinkGetLinkedPtx = NULL
-cdef void* __nvJitLinkGetErrorLogSize = NULL
-cdef void* __nvJitLinkGetErrorLog = NULL
-cdef void* __nvJitLinkGetInfoLogSize = NULL
-cdef void* __nvJitLinkGetInfoLog = NULL
-
-
-cdef void* load_library(const int driver_ver) except* with gil:
-    cdef void* handle
-    for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
-        so_name = "libnvjitlink.so" + (f".{suffix}" if suffix else suffix)
-        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
-        if handle != NULL:
-            break
-    else:
-        err_msg = dlerror()
-        raise RuntimeError(f'Failed to dlopen libnvjitlink ({err_msg.decode()})')
-    return handle
-
-
-cdef int _check_or_init_nvjitlink() except -1 nogil:
-    global __py_nvjitlink_init
-    if __py_nvjitlink_init:
-        return 0
-
-    # Load driver to check version
-    cdef void* handle = NULL
-    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
-    if handle == NULL:
-        with gil:
-            err_msg = dlerror()
-            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
-    global __cuDriverGetVersion
-    if __cuDriverGetVersion == NULL:
-        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
-    if __cuDriverGetVersion == NULL:
-        with gil:
-            raise RuntimeError('something went wrong')
-    cdef int err, driver_ver
-    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
-    if err != 0:
-        with gil:
-            raise RuntimeError('something went wrong')
-    #dlclose(handle)
-    handle = NULL
-
-    # Load function
-    global __nvJitLinkCreate
-    __nvJitLinkCreate = dlsym(RTLD_DEFAULT, 'nvJitLinkCreate')
-    if __nvJitLinkCreate == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkCreate = dlsym(handle, 'nvJitLinkCreate')
-    
-    global __nvJitLinkDestroy
-    __nvJitLinkDestroy = dlsym(RTLD_DEFAULT, 'nvJitLinkDestroy')
-    if __nvJitLinkDestroy == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkDestroy = dlsym(handle, 'nvJitLinkDestroy')
-    
-    global __nvJitLinkAddData
-    __nvJitLinkAddData = dlsym(RTLD_DEFAULT, 'nvJitLinkAddData')
-    if __nvJitLinkAddData == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddData = dlsym(handle, 'nvJitLinkAddData')
-    
-    global __nvJitLinkAddFile
-    __nvJitLinkAddFile = dlsym(RTLD_DEFAULT, 'nvJitLinkAddFile')
-    if __nvJitLinkAddFile == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkAddFile = dlsym(handle, 'nvJitLinkAddFile')
-    
-    global __nvJitLinkComplete
-    __nvJitLinkComplete = dlsym(RTLD_DEFAULT, 'nvJitLinkComplete')
-    if __nvJitLinkComplete == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkComplete = dlsym(handle, 'nvJitLinkComplete')
-    
-    global __nvJitLinkGetLinkedCubinSize
-    __nvJitLinkGetLinkedCubinSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubinSize')
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubinSize = dlsym(handle, 'nvJitLinkGetLinkedCubinSize')
-    
-    global __nvJitLinkGetLinkedCubin
-    __nvJitLinkGetLinkedCubin = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedCubin')
-    if __nvJitLinkGetLinkedCubin == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedCubin = dlsym(handle, 'nvJitLinkGetLinkedCubin')
-    
-    global __nvJitLinkGetLinkedPtxSize
-    __nvJitLinkGetLinkedPtxSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtxSize')
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtxSize = dlsym(handle, 'nvJitLinkGetLinkedPtxSize')
-    
-    global __nvJitLinkGetLinkedPtx
-    __nvJitLinkGetLinkedPtx = dlsym(RTLD_DEFAULT, 'nvJitLinkGetLinkedPtx')
-    if __nvJitLinkGetLinkedPtx == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetLinkedPtx = dlsym(handle, 'nvJitLinkGetLinkedPtx')
-    
-    global __nvJitLinkGetErrorLogSize
-    __nvJitLinkGetErrorLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLogSize')
-    if __nvJitLinkGetErrorLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLogSize = dlsym(handle, 'nvJitLinkGetErrorLogSize')
-    
-    global __nvJitLinkGetErrorLog
-    __nvJitLinkGetErrorLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetErrorLog')
-    if __nvJitLinkGetErrorLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetErrorLog = dlsym(handle, 'nvJitLinkGetErrorLog')
-    
-    global __nvJitLinkGetInfoLogSize
-    __nvJitLinkGetInfoLogSize = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLogSize')
-    if __nvJitLinkGetInfoLogSize == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLogSize = dlsym(handle, 'nvJitLinkGetInfoLogSize')
-    
-    global __nvJitLinkGetInfoLog
-    __nvJitLinkGetInfoLog = dlsym(RTLD_DEFAULT, 'nvJitLinkGetInfoLog')
-    if __nvJitLinkGetInfoLog == NULL:
-        if handle == NULL:
-            handle = load_library(driver_ver)
-        __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
-
-    __py_nvjitlink_init = True
-    return 0
-
-
-cdef dict func_ptrs = None
-
-
-cpdef dict _inspect_function_pointers():
-    global func_ptrs
-    if func_ptrs is not None:
-        return func_ptrs
-
-    _check_or_init_nvjitlink()
-    cdef dict data = {}
-
-    global __nvJitLinkCreate
-    data["__nvJitLinkCreate"] = <intptr_t>__nvJitLinkCreate
-    
-    global __nvJitLinkDestroy
-    data["__nvJitLinkDestroy"] = <intptr_t>__nvJitLinkDestroy
-    
-    global __nvJitLinkAddData
-    data["__nvJitLinkAddData"] = <intptr_t>__nvJitLinkAddData
-    
-    global __nvJitLinkAddFile
-    data["__nvJitLinkAddFile"] = <intptr_t>__nvJitLinkAddFile
-    
-    global __nvJitLinkComplete
-    data["__nvJitLinkComplete"] = <intptr_t>__nvJitLinkComplete
-    
-    global __nvJitLinkGetLinkedCubinSize
-    data["__nvJitLinkGetLinkedCubinSize"] = <intptr_t>__nvJitLinkGetLinkedCubinSize
-    
-    global __nvJitLinkGetLinkedCubin
-    data["__nvJitLinkGetLinkedCubin"] = <intptr_t>__nvJitLinkGetLinkedCubin
-    
-    global __nvJitLinkGetLinkedPtxSize
-    data["__nvJitLinkGetLinkedPtxSize"] = <intptr_t>__nvJitLinkGetLinkedPtxSize
-    
-    global __nvJitLinkGetLinkedPtx
-    data["__nvJitLinkGetLinkedPtx"] = <intptr_t>__nvJitLinkGetLinkedPtx
-    
-    global __nvJitLinkGetErrorLogSize
-    data["__nvJitLinkGetErrorLogSize"] = <intptr_t>__nvJitLinkGetErrorLogSize
-    
-    global __nvJitLinkGetErrorLog
-    data["__nvJitLinkGetErrorLog"] = <intptr_t>__nvJitLinkGetErrorLog
-    
-    global __nvJitLinkGetInfoLogSize
-    data["__nvJitLinkGetInfoLogSize"] = <intptr_t>__nvJitLinkGetInfoLogSize
-    
-    global __nvJitLinkGetInfoLog
-    data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
-
-    func_ptrs = data
-    return data
-
-
-cpdef _inspect_function_pointer(str name):
-    global func_ptrs
-    if func_ptrs is None:
-        func_ptrs = _inspect_function_pointers()
-    return func_ptrs[name]
-
-
-###############################################################################
-# Wrapper functions
-###############################################################################
-
-cdef nvJitLinkResult _nvJitLinkCreate(nvJitLinkHandle* handle, uint32_t numOptions, const char** options) except* nogil:
-    global __nvJitLinkCreate
-    _check_or_init_nvjitlink()
-    if __nvJitLinkCreate == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkCreate is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*, uint32_t, const char**) nogil>__nvJitLinkCreate)(
-        handle, numOptions, options)
-
-
-cdef nvJitLinkResult _nvJitLinkDestroy(nvJitLinkHandle* handle) except* nogil:
-    global __nvJitLinkDestroy
-    _check_or_init_nvjitlink()
-    if __nvJitLinkDestroy == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkDestroy is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle*) nogil>__nvJitLinkDestroy)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void* data, size_t size, const char* name) except* nogil:
-    global __nvJitLinkAddData
-    _check_or_init_nvjitlink()
-    if __nvJitLinkAddData == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddData is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const void*, size_t, const char*) nogil>__nvJitLinkAddData)(
-        handle, inputType, data, size, name)
-
-
-cdef nvJitLinkResult _nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char* fileName) except* nogil:
-    global __nvJitLinkAddFile
-    _check_or_init_nvjitlink()
-    if __nvJitLinkAddFile == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkAddFile is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, nvJitLinkInputType, const char*) nogil>__nvJitLinkAddFile)(
-        handle, inputType, fileName)
-
-
-cdef nvJitLinkResult _nvJitLinkComplete(nvJitLinkHandle handle) except* nogil:
-    global __nvJitLinkComplete
-    _check_or_init_nvjitlink()
-    if __nvJitLinkComplete == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkComplete is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle) nogil>__nvJitLinkComplete)(
-        handle)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedCubinSize
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetLinkedCubinSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubinSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedCubinSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void* cubin) except* nogil:
-    global __nvJitLinkGetLinkedCubin
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetLinkedCubin == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedCubin is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, void*) nogil>__nvJitLinkGetLinkedCubin)(
-        handle, cubin)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetLinkedPtxSize
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetLinkedPtxSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtxSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetLinkedPtxSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) except* nogil:
-    global __nvJitLinkGetLinkedPtx
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetLinkedPtx == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetLinkedPtx is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetLinkedPtx)(
-        handle, ptx)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetErrorLogSize
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetErrorLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetErrorLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetErrorLog
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetErrorLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetErrorLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetErrorLog)(
-        handle, log)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil:
-    global __nvJitLinkGetInfoLogSize
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetInfoLogSize == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLogSize is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, size_t*) nogil>__nvJitLinkGetInfoLogSize)(
-        handle, size)
-
-
-cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    global __nvJitLinkGetInfoLog
-    _check_or_init_nvjitlink()
-    if __nvJitLinkGetInfoLog == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
-    return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
-        handle, log)
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index f0aaee771..592464487 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -61,7 +61,7 @@
                  'cuda_gl_interop.h',
                  'cuda_vdpau_interop.h'],
     'nvrtc' : ['nvrtc.h'],
-    'nvJitLink' : ['nvJitLink.h'],}
+    'nvJitLink' : ['nvJitLink.h']}
 
 replace = {' __device_builtin__ ':' ',
            'CUDARTAPI ':' ',
@@ -93,13 +93,16 @@
                 break
         if not os.path.exists(path):
             print(f'Missing header {header}')
+
     print(f'Parsing {library} headers')
     parser = CParser(header_paths,
                      cache='./cache_{}'.format(library.split('.')[0]) if PARSER_CACHING else None,
                      replace=replace)
+    
     if library == 'driver':
         CUDA_VERSION = parser.defs['macros']['CUDA_VERSION'] if 'CUDA_VERSION' in parser.defs['macros'] else 'Unknown'
         print(f'Found CUDA_VERSION: {CUDA_VERSION}')
+
     # Combine types with others since they sometimes get tangled
     found_types += {key for key in parser.defs['types']}
     found_types += {key for key in parser.defs['structs']}
@@ -109,13 +112,16 @@
     found_types += {key for key in parser.defs['enums']}
     found_functions += {key for key in parser.defs['functions']}
     found_values += {key for key in parser.defs['values']}
+
 if len(found_functions) == 0:
     raise RuntimeError(f'Parser found no functions. Is CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
+
 # Unwrap struct and union members
 def unwrapMembers(found_dict):
     for key in found_dict:
         members = [var for var, _, _ in found_dict[key]['members']]
         found_dict[key]['members'] = members
+        
 unwrapMembers(found_structs)
 unwrapMembers(found_unions)
 
diff --git a/cuda_bindings/tests/test_nvJitLink.py b/cuda_bindings/tests/test_nvJitLink.py
deleted file mode 100644
index f566ae7c6..000000000
--- a/cuda_bindings/tests/test_nvJitLink.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import pytest
-from cuda import nvJitLink
-
-def test_create_no_arch_error():
-    # nvjitlink expects at least the architecture to be specified.
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
-        nvJitLink.create()
-
-
-def test_invalid_arch_error():
-    # sm_XX is not a valid architecture
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-arch=sm_XX")
-
-
-def test_unrecognized_option_error():
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvJitLink.create("-fictitious_option")
-
-
-def test_invalid_option_type_error():
-    with pytest.raises(TypeError, match="Expecting only strings"):
-        nvJitLink.create("-arch", 53)
-
-
-def test_create_and_destroy():
-    handle = nvJitLink.create("-arch=sm_53")
-    assert handle != 0
-    nvJitLink.destroy(handle)
-
-
-def test_complete_empty():
-    handle = nvJitLink.create("-arch=sm_75")
-    nvJitLink.complete(handle)
-    nvJitLink.destroy(handle)
-
-
-@pytest.mark.parametrize(
-    "input_file,input_type",
-    [
-        ("device_functions_cubin", nvJitLink.InputType.CUBIN),
-        ("device_functions_fatbin", InputType.FATBIN),
-        ("device_functions_ptx", InputType.PTX),
-        ("device_functions_object", InputType.OBJECT),
-        ("device_functions_archive", InputType.LIBRARY),
-    ],
-)
-def test_add_file(input_file, input_type, gpu_arch_flag, request):
-    filename, data = request.getfixturevalue(input_file)
-
-    handle = nvJitLink.create(gpu_arch_flag)
-    nvJitLink.add_data(handle, input_type.value, data, filename)
-    nvJitLink.destroy(handle)
-
-
-# We test the LTO input case separately as it requires the `-lto` flag. The
-# OBJECT input type is used because the LTO-IR container is packaged in an ELF
-# object when produced by NVCC.
-def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, InputType.OBJECT.value, data, filename)
-    nvJitLink.destroy(handle)
-
-
-def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = undefined_extern_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError):
-        nvJitLink.complete(handle)
-    error_log = nvJitLink.get_error_log(handle)
-    nvJitLink.destroy(handle)
-    assert (
-        "Undefined reference to '_Z5undefff' "
-        "in 'undefined_extern.cubin'" in error_log
-    )
-
-
-def test_get_info_log(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    info_log = nvJitLink.get_info_log(handle)
-    nvJitLink.destroy(handle)
-    # Info log is empty
-    assert "" == info_log
-
-
-def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
-
-
-def test_get_linked_cubin_link_not_complete_error(
-    device_functions_cubin, gpu_arch_flag
-):
-    handle = nvJitLink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    cubin = nvJitLink.get_linked_cubin(handle)
-    nvJitLink.destroy(handle)
-
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
-
-
-def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
-    nvJitLink.add_data(handle, input_type, data, filename)
-    nvJitLink.complete(handle)
-    nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_get_linked_ptx_link_not_complete_error(
-    device_functions_ltoir_object, gpu_arch_flag
-):
-    handle = nvJitLink.create(gpu_arch_flag, "-lto", "-ptx")
-    filename, data = device_functions_ltoir_object
-    input_type = InputType.OBJECT.value
-    nvJitLink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvJitLink.get_linked_ptx(handle)
-    nvJitLink.destroy(handle)
-
-
-def test_package_version():
-    assert pynvjitlink.__version__ is not None
-    assert len(str(pynvjitlink.__version__)) > 0
\ No newline at end of file

From f7f4fe58beb9d08cebd63632d69c156b54177246 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 05:09:23 +0000
Subject: [PATCH 12/34] regenerate

---
 .../cuda/bindings/_internal/nvjitlink.pxd     |  4 +-
 .../bindings/_internal/nvjitlink_linux.pyx    |  7 +-
 .../bindings/_internal/nvjitlink_windows.pyx  | 13 ++--
 .../cuda/bindings/_internal/utils.pxd         | 21 ++----
 .../cuda/bindings/_internal/utils.pyx         | 75 +++++++++----------
 cuda_bindings/cuda/bindings/cynvjitlink.pxd   | 15 +---
 cuda_bindings/cuda/bindings/cynvjitlink.pyx   | 14 +---
 cuda_bindings/cuda/bindings/nvjitlink.pxd     | 12 +--
 cuda_bindings/cuda/bindings/nvjitlink.pyx     | 30 +++++---
 9 files changed, 84 insertions(+), 107 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
index ac3a9023b..bca8867df 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
@@ -1,8 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 from ..cynvjitlink cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index ff7a6ca3a..ab3d42be3 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -10,7 +10,6 @@ from .utils cimport get_nvjitlink_dso_version_suffix
 
 from .utils import FunctionNotFoundError, NotSupportedError
 
-
 ###############################################################################
 # Extern
 ###############################################################################
@@ -55,13 +54,13 @@ cdef void* __nvJitLinkGetInfoLog = NULL
 cdef void* load_library(const int driver_ver) except* with gil:
     cdef void* handle
     for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
-        so_name = "libnvjitlink.so" + (f".{suffix}" if suffix else suffix)
+        so_name = "libnvJitLink.so" + (f".{suffix}" if suffix else suffix)
         handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
         if handle != NULL:
             break
     else:
         err_msg = dlerror()
-        raise RuntimeError(f'Failed to dlopen libnvjitlink ({err_msg.decode()})')
+        raise RuntimeError(f'Failed to dlopen libnvJitLink ({err_msg.decode()})')
     return handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 5cac180f3..feddec3ca 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -2,19 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
-from utils cimport get_nvjitlink_dso_version_suffix
+from .utils cimport get_nvjitlink_dso_version_suffix
+
 
 import os
 import site
 
 import win32api
 
-from utils import FunctionNotFoundError, NotSupportedError
-
 
 ###############################################################################
 # Wrapper init
@@ -51,7 +50,7 @@ cdef load_library(const int driver_ver):
     for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
         if len(suffix) == 0:
             continue
-        dll_name = f"nvjitlink64_{suffix}.dll"
+        dll_name = f"nvJitLink64_{suffix}.dll"
 
         # First check if the DLL has been loaded by 3rd parties
         try:
@@ -63,7 +62,7 @@ cdef load_library(const int driver_ver):
 
         # Next, check if DLLs are installed via pip
         for sp in get_site_packages():
-            mod_path = os.path.join(sp, "nvidia", "nvjitlink", "bin")
+            mod_path = os.path.join(sp, "nvidia", "nvJitLink", "bin")
             if not os.path.isdir(mod_path):
                 continue
             os.add_dll_directory(mod_path)
@@ -85,7 +84,7 @@ cdef load_library(const int driver_ver):
         else:
             break
     else:
-        raise RuntimeError('Failed to load nvjitlink')
+        raise RuntimeError('Failed to load nvJitLink')
 
     assert handle != 0
     return handle
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
index 225ab3648..be5d4ad61 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -136,13 +136,6 @@ cdef extern from * nogil:
         void* data()
 
 
-cdef extern from "<cuComplex.h>" nogil:
-    ctypedef struct cuComplex:
-        pass
-    ctypedef struct cuDoubleComplex:
-        pass
-
-
 ctypedef fused ResT:
     int
     int32_t
@@ -150,10 +143,6 @@ ctypedef fused ResT:
 
 
 ctypedef fused PtrT:
-    float
-    double
-    cuComplex
-    cuDoubleComplex
     void
 
 
@@ -161,10 +150,12 @@ cdef cppclass nested_resource[T]:
     nullable_unique_ptr[ vector[intptr_t] ] ptrs
     nullable_unique_ptr[ vector[vector[T]] ] nested_resource_ptr
 
-cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr_(object obj, ResT* __unused)
-cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 0
-cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused)
-cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused)
+
+# accepts the output pointer as input to use the return value for exception propagation
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1
+cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1
+cdef int get_char_ptrs(nullable_unique_ptr[ vector[char*] ] &in_out_ptr, object obj) except 1
+cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1
 
 cdef bint is_nested_sequence(data)
 cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=*) except*
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
index b575ddc03..904e08da0 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -46,51 +46,47 @@ cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=True) except*:
     return bufPtr
 
 
-# Cython can't infer the overload by return type alone, so we need a dummy
-# input argument to help it
-cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr_(object obj, ResT* __unused):
-    cdef nullable_unique_ptr[ vector[ResT] ] ptr
-    cdef vector[ResT]* vec
+# Cython can't infer the ResT overload when it is wrapped in nullable_unique_ptr,
+# so we need a dummy (__unused) input argument to help it
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1:
     if cpython.PySequence_Check(obj):
         vec = new vector[ResT](len(obj))
+        # set the ownership immediately to avoid leaking the `vec` memory in
+        # case of exception in the following loop
+        in_out_ptr.reset(vec, True)
         for i in range(len(obj)):
             deref(vec)[i] = obj[i]
-        ptr.reset(vec, True)
     else:
-        ptr.reset(<vector[ResT]*><intptr_t>obj, False)
-    return move(ptr)
+        in_out_ptr.reset(<vector[ResT]*><intptr_t>obj, False)
+    return 0
+
 
-cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 0:
-    cdef vector[ResT]* vec
+cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1:
     if cpython.PySequence_Check(obj):
-        vec = new vector[ResT](len(obj))
-        # set the ownership immediately to avoid
-        # leaking the `vec` memory in case of exception 
-        # (e.g. ResT type range overflow)
-        # when populating the memory in the loop
+        vec = new vector[PtrT*](len(obj))
+        # set the ownership immediately to avoid leaking the `vec` memory in
+        # case of exception in the following loop
         in_out_ptr.reset(vec, True)
         for i in range(len(obj)):
-            deref(vec)[i] = obj[i]
+            deref(vec)[i] = <PtrT*><intptr_t>(obj[i])
     else:
-        in_out_ptr.reset(<vector[ResT]*><intptr_t>obj, False)
-    return 1
+        in_out_ptr.reset(<vector[PtrT*]*><intptr_t>obj, False)
+    return 0
 
 
-cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused):
-    cdef nullable_unique_ptr[ vector[PtrT*] ] ptr
-    cdef vector[PtrT*]* vec
+cdef int get_char_ptrs(nullable_unique_ptr[ vector[char*] ] &in_out_ptr, object obj) except 1:
     if cpython.PySequence_Check(obj):
-        vec = new vector[PtrT*](len(obj))
+        vec = new vector[char*](len(obj))
+        in_out_ptr.reset(vec, True)
         for i in range(len(obj)):
-            deref(vec)[i] = <PtrT*><intptr_t>(obj[i])
-        ptr.reset(vec, True)
+            #__TODO__ is there a lifetime difference between this char* and some other ptrT*
+            deref(vec)[i] = obj[i]
     else:
-        ptr.reset(<vector[PtrT*]*><intptr_t>obj, False)
-    return move(ptr)
+        in_out_ptr.reset(<vector[char*]*><intptr_t>obj, False)
+    return 0
 
 
-cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused):
-    cdef nested_resource[ResT] res
+cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1:
     cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr
     cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr
     cdef vector[intptr_t]* nested_vec = NULL
@@ -102,26 +98,28 @@ cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused):
         length = len(obj)
         nested_res_vec = new vector[vector[ResT]](length)
         nested_vec = new vector[intptr_t](length)
+        # set the ownership immediately to avoid leaking memory in case of
+        # exception in the following loop
+        nested_res_ptr.reset(nested_res_vec, True)
+        nested_ptr.reset(nested_vec, True)
         for i, obj_i in enumerate(obj):
             deref(nested_res_vec)[i] = obj_i
             deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data())
-        nested_res_ptr.reset(nested_res_vec, True)
-        nested_ptr.reset(nested_vec, True)
     elif cpython.PySequence_Check(obj):
         length = len(obj)
         nested_vec = new vector[intptr_t](length)
+        nested_ptr.reset(nested_vec, True)
         for i, addr in enumerate(obj):
             deref(nested_vec)[i] = addr
         nested_res_ptr.reset(NULL, False)
-        nested_ptr.reset(nested_vec, True)
     else:
         # obj is an int (ResT**)
         nested_res_ptr.reset(NULL, False)
         nested_ptr.reset(<vector[intptr_t]*><intptr_t>obj, False)
 
-    res.ptrs = move(nested_ptr)
-    res.nested_resource_ptr = move(nested_res_ptr)
-    return move(res)
+    in_out_ptr.ptrs = move(nested_ptr)
+    in_out_ptr.nested_resource_ptr = move(nested_res_ptr)
+    return 0
 
 
 class FunctionNotFoundError(RuntimeError): pass
@@ -130,10 +128,7 @@ class NotSupportedError(RuntimeError): pass
 
 
 cdef tuple get_nvjitlink_dso_version_suffix(int driver_ver):
-    # applicable to both cuBLAS and cuBLASLt
-    if 11000 <= driver_ver < 12000:
-        return ('11', '')
-    elif 12000 <= driver_ver < 13000:
-        return ('12', '11', '')
+    if 12000 <= driver_ver < 13000:
+        return ('12', '')
     else:
-        raise NotSupportedError('only CUDA 11/12 driver is supported')
\ No newline at end of file
+        raise NotSupportedError('only CUDA 12 driver is supported')
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 3f4134706..45c80d3af 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -1,19 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
-
-<<<<<<< HEAD
-<<<<<<< HEAD
-from libc.stdint cimport uint32_t
-=======
 from libc.stdint cimport intptr_t, uint32_t
->>>>>>> 5d60eb1 (more changes)
-=======
-from libc.stdint cimport uint32_t
->>>>>>> 8c4029f (working)
 
 
 ###############################################################################
@@ -74,4 +65,4 @@ cdef nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char* ptx) ex
 cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
 cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
 cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
-cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
\ No newline at end of file
+cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index c91948f03..3d55097b0 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -1,18 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 from ._internal cimport nvjitlink as _nvjitlink
-<<<<<<< HEAD
-<<<<<<< HEAD
-from libc.stdint cimport uint32_t
-=======
->>>>>>> 5d60eb1 (more changes)
-=======
-from libc.stdint cimport uint32_t
->>>>>>> 8c4029f (working)
 
 
 ###############################################################################
@@ -68,4 +60,4 @@ cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* siz
 
 
 cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
-    return _nvjitlink._nvJitLinkGetInfoLog(handle, log)
\ No newline at end of file
+    return _nvjitlink._nvJitLinkGetInfoLog(handle, log)
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index de4d46170..3cec24841 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -1,8 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
@@ -20,15 +20,15 @@ ctypedef nvJitLinkHandle Handle
 # Enum
 ###############################################################################
 
-ctypedef nvJitLinkResult _NvJitLinkResult
-ctypedef nvJitLinkInputType _NvJitLinkInputType
+ctypedef nvJitLinkResult _Result
+ctypedef nvJitLinkInputType _InputType
 
 
 ###############################################################################
 # Functions
 ###############################################################################
 
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options)
+cpdef intptr_t create(uint32_t num_options, options) except -1
 cpdef destroy(intptr_t handle)
 cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name)
 cpdef add_file(intptr_t handle, int input_type, intptr_t file_name)
@@ -40,4 +40,4 @@ cpdef get_linked_ptx(intptr_t handle, intptr_t ptx)
 cpdef get_error_log_size(intptr_t handle, intptr_t size)
 cpdef get_error_log(intptr_t handle, intptr_t log)
 cpdef get_info_log_size(intptr_t handle, intptr_t size)
-cpdef get_info_log(intptr_t handle, intptr_t log)
\ No newline at end of file
+cpdef get_info_log(intptr_t handle, intptr_t log)
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 8c1a89976..9156d970c 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -1,19 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.76 to 12.6.77. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
 cimport cython  # NOQA
 
+from ._internal.utils cimport (get_resource_ptr, get_nested_resource_ptr, nested_resource, nullable_unique_ptr,
+                       get_buffer_pointer, get_resource_ptrs, get_char_ptrs)
+
 from enum import IntEnum as _IntEnum
+from libcpp.vector cimport vector
 
 
 ###############################################################################
 # Enum
 ###############################################################################
 
-class NvJitLinkResult(_IntEnum):
+class Result(_IntEnum):
     """See `nvJitLinkResult`."""
     SUCCESS = NVJITLINK_SUCCESS
     ERROR_UNRECOGNIZED_OPTION = NVJITLINK_ERROR_UNRECOGNIZED_OPTION
@@ -26,7 +30,7 @@ class NvJitLinkResult(_IntEnum):
     ERROR_UNRECOGNIZED_INPUT = NVJITLINK_ERROR_UNRECOGNIZED_INPUT
     ERROR_FINALIZE = NVJITLINK_ERROR_FINALIZE
 
-class NvJitLinkInputType(_IntEnum):
+class InputType(_IntEnum):
     """See `nvJitLinkInputType`."""
     INPUT_NONE = NVJITLINK_INPUT_NONE
     INPUT_CUBIN = NVJITLINK_INPUT_CUBIN
@@ -65,20 +69,26 @@ class nvJitLinkError(Exception):
 
 
 @cython.profile(False)
-cdef inline void check_status(int status) nogil:
+cdef int check_status(int status) except 1 nogil:
     if status != 0:
         with gil:
             raise nvJitLinkError(status)
+    return status
 
 
 ###############################################################################
 # Wrapper functions
 ###############################################################################
 
-cpdef create(intptr_t handle, uint32_t num_options, intptr_t options):
+cpdef intptr_t create(uint32_t num_options, options) except -1:
+    cdef list converted_options = [(<str?>(s)).encode() for s in options]
+    cdef nullable_unique_ptr[ vector[char*] ] _options_
+    get_char_ptrs(_options_, converted_options)
+    cdef Handle handle
     with nogil:
-        status = nvJitLinkCreate(<Handle*>handle, num_options, <const char**>options)
+        status = nvJitLinkCreate(&handle, num_options, <const char**>(_options_.data()))
     check_status(status)
+    return <intptr_t>handle
 
 
 cpdef destroy(intptr_t handle):
@@ -89,13 +99,13 @@ cpdef destroy(intptr_t handle):
 
 cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name):
     with nogil:
-        status = nvJitLinkAddData(<Handle>handle, <_NvJitLinkInputType>input_type, <const void*>data, size, <const char*>name)
+        status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>data, size, <const char*>name)
     check_status(status)
 
 
 cpdef add_file(intptr_t handle, int input_type, intptr_t file_name):
     with nogil:
-        status = nvJitLinkAddFile(<Handle>handle, <_NvJitLinkInputType>input_type, <const char*>file_name)
+        status = nvJitLinkAddFile(<Handle>handle, <_InputType>input_type, <const char*>file_name)
     check_status(status)
 
 
@@ -150,4 +160,4 @@ cpdef get_info_log_size(intptr_t handle, intptr_t size):
 cpdef get_info_log(intptr_t handle, intptr_t log):
     with nogil:
         status = nvJitLinkGetInfoLog(<Handle>handle, <char*>log)
-    check_status(status)
\ No newline at end of file
+    check_status(status)

From 990f4cbe308051df256346c18aafd8e443ab1875 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 05:11:14 +0000
Subject: [PATCH 13/34] clean up a bit

---
 cuda_bindings/setup.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 592464487..9d0f6fad3 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -60,8 +60,7 @@
                  'cuda_egl_interop.h',
                  'cuda_gl_interop.h',
                  'cuda_vdpau_interop.h'],
-    'nvrtc' : ['nvrtc.h'],
-    'nvJitLink' : ['nvJitLink.h']}
+    'nvrtc' : ['nvrtc.h']}
 
 replace = {' __device_builtin__ ':' ',
            'CUDARTAPI ':' ',
@@ -98,7 +97,7 @@
     parser = CParser(header_paths,
                      cache='./cache_{}'.format(library.split('.')[0]) if PARSER_CACHING else None,
                      replace=replace)
-    
+
     if library == 'driver':
         CUDA_VERSION = parser.defs['macros']['CUDA_VERSION'] if 'CUDA_VERSION' in parser.defs['macros'] else 'Unknown'
         print(f'Found CUDA_VERSION: {CUDA_VERSION}')
@@ -121,7 +120,7 @@ def unwrapMembers(found_dict):
     for key in found_dict:
         members = [var for var, _, _ in found_dict[key]['members']]
         found_dict[key]['members'] = members
-        
+
 unwrapMembers(found_structs)
 unwrapMembers(found_unions)
 
@@ -198,7 +197,6 @@ def prep_extensions(sources):
     pattern = sources[0]
     files = glob.glob(pattern)
     exts = []
-    print(include_dirs, library_dirs)
     for pyx in files:
         mod_name = pyx.replace(".pyx", "").replace(os.sep, ".").replace("/", ".")
         exts.append(
@@ -215,8 +213,10 @@ def prep_extensions(sources):
         )
     return exts
 
+
 # new path for the bindings from cybind
-def rename_architecture_specific_files(path):
+def rename_architecture_specific_files():
+    architechture_specific_files_dir = 'cuda/bindings/_internal/'
     if sys.platform == 'linux':
         src_files = glob.glob(os.path.join(path, '*_linux.pyx'))
     elif sys.platform == 'win32':
@@ -234,16 +234,20 @@ def rename_architecture_specific_files(path):
         # atomic move with the destination guaranteed to be overwritten
         os.replace(f_name, f"./{dst}")
         dst_files.append(dst)
+    return dst_files
+
+
+dst_files = rename_architecture_specific_files()
+
 
 @atexit.register
 def cleanup_dst_files():
-    for dst in architechture_specific_files_dir:
+    for dst in dst_files:
         try:
             os.remove(dst)
         except FileNotFoundError:
             pass
-        
-architechture_specific_files_dir = 'cuda/bindings/_internal/'
+
 
 def do_cythonize(extensions):
     return cythonize(
@@ -254,7 +258,6 @@ def do_cythonize(extensions):
         ),
         **extra_cythonize_kwargs)
 
-rename_architecture_specific_files(architechture_specific_files_dir)
 
 sources_list = [
     # private
@@ -307,7 +310,6 @@ def finalize_options(self):
         find_packages(include=["cuda.cuda", "cuda.cuda.*", "cuda.cuda.bindings", "cuda.cuda.bindings._bindings", "cuda.cuda.bindings._lib", "cuda.cuda.bindings._lib.cyruntime", "cuda.cuda.bindings._internal", "tests"]),
         ["*.pxd", "*.pyx", "*.py", "*.h", "*.cpp"],
     ),
-    
     cmdclass=cmdclass,
     zip_safe=False,
-)
\ No newline at end of file
+)

From fa87ea80280bada27bc788ddc88e2f68989cd072 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 05:19:40 +0000
Subject: [PATCH 14/34] strip input enumerator prefix

---
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 9156d970c..bfc6acb22 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -32,15 +32,15 @@ class Result(_IntEnum):
 
 class InputType(_IntEnum):
     """See `nvJitLinkInputType`."""
-    INPUT_NONE = NVJITLINK_INPUT_NONE
-    INPUT_CUBIN = NVJITLINK_INPUT_CUBIN
-    INPUT_PTX = NVJITLINK_INPUT_PTX
-    INPUT_LTOIR = NVJITLINK_INPUT_LTOIR
-    INPUT_FATBIN = NVJITLINK_INPUT_FATBIN
-    INPUT_OBJECT = NVJITLINK_INPUT_OBJECT
-    INPUT_LIBRARY = NVJITLINK_INPUT_LIBRARY
-    INPUT_INDEX = NVJITLINK_INPUT_INDEX
-    INPUT_ANY = NVJITLINK_INPUT_ANY
+    NONE = NVJITLINK_INPUT_NONE
+    CUBIN = NVJITLINK_INPUT_CUBIN
+    PTX = NVJITLINK_INPUT_PTX
+    LTOIR = NVJITLINK_INPUT_LTOIR
+    FATBIN = NVJITLINK_INPUT_FATBIN
+    OBJECT = NVJITLINK_INPUT_OBJECT
+    LIBRARY = NVJITLINK_INPUT_LIBRARY
+    INDEX = NVJITLINK_INPUT_INDEX
+    ANY = NVJITLINK_INPUT_ANY
 
 
 ###############################################################################

From 5f13031103608b09b1a49fa5a63800076f4252a0 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 05:50:38 +0000
Subject: [PATCH 15/34] hand-write nvJitLinkDestroy lowpp binding; turn on
 doxygen

---
 cuda_bindings/cuda/bindings/nvjitlink.pxd |   1 -
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 117 +++++++++++++++++++++-
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 3cec24841..689820721 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -29,7 +29,6 @@ ctypedef nvJitLinkInputType _InputType
 ###############################################################################
 
 cpdef intptr_t create(uint32_t num_options, options) except -1
-cpdef destroy(intptr_t handle)
 cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name)
 cpdef add_file(intptr_t handle, int input_type, intptr_t file_name)
 cpdef complete(intptr_t handle)
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index bfc6acb22..dedf49ea1 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -80,7 +80,29 @@ cdef int check_status(int status) except 1 nogil:
 # Wrapper functions
 ###############################################################################
 
+cpdef destroy(intptr_t handle):
+    cdef Handle h = <Handle>handle
+    with nogil:
+        status = nvJitLinkDestroy(&h)
+    check_status(status)
+
+
 cpdef intptr_t create(uint32_t num_options, options) except -1:
+    """nvJitLinkCreate creates an instance of nvJitLinkHandle with the given input options, and sets the output parameter ``handle``.
+
+    Args:
+        num_options (uint32_t): Number of options passed.
+        options (object): Array of size ``num_options`` of option strings. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``char*``.
+
+
+    Returns:
+        intptr_t: Address of nvJitLink handle.
+
+    .. seealso:: `nvJitLinkCreate`
+    """
     cdef list converted_options = [(<str?>(s)).encode() for s in options]
     cdef nullable_unique_ptr[ vector[char*] ] _options_
     get_char_ptrs(_options_, converted_options)
@@ -91,73 +113,158 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
     return <intptr_t>handle
 
 
-cpdef destroy(intptr_t handle):
-    with nogil:
-        status = nvJitLinkDestroy(<Handle*>handle)
-    check_status(status)
+cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name):
+    """nvJitLinkAddData adds data image to the link.
 
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        input_type (InputType): kind of input.
+        data (intptr_t): pointer to data image in memory.
+        size (size_t): size of the data.
+        name (intptr_t): name of input object.
 
-cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name):
+    .. seealso:: `nvJitLinkAddData`
+    """
     with nogil:
         status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>data, size, <const char*>name)
     check_status(status)
 
 
 cpdef add_file(intptr_t handle, int input_type, intptr_t file_name):
+    """nvJitLinkAddFile reads data from file and links it in.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        input_type (InputType): kind of input.
+        file_name (intptr_t): name of file.
+
+    .. seealso:: `nvJitLinkAddFile`
+    """
     with nogil:
         status = nvJitLinkAddFile(<Handle>handle, <_InputType>input_type, <const char*>file_name)
     check_status(status)
 
 
 cpdef complete(intptr_t handle):
+    """nvJitLinkComplete does the actual link.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    .. seealso:: `nvJitLinkComplete`
+    """
     with nogil:
         status = nvJitLinkComplete(<Handle>handle)
     check_status(status)
 
 
 cpdef get_linked_cubin_size(intptr_t handle, intptr_t size):
+    """nvJitLinkGetLinkedCubinSize gets the size of the linked cubin.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        size (intptr_t): Size of the linked cubin.
+
+    .. seealso:: `nvJitLinkGetLinkedCubinSize`
+    """
     with nogil:
         status = nvJitLinkGetLinkedCubinSize(<Handle>handle, <size_t*>size)
     check_status(status)
 
 
 cpdef get_linked_cubin(intptr_t handle, intptr_t cubin):
+    """nvJitLinkGetLinkedCubin gets the linked cubin.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        cubin (intptr_t): The linked cubin.
+
+    .. seealso:: `nvJitLinkGetLinkedCubin`
+    """
     with nogil:
         status = nvJitLinkGetLinkedCubin(<Handle>handle, <void*>cubin)
     check_status(status)
 
 
 cpdef get_linked_ptx_size(intptr_t handle, intptr_t size):
+    """nvJitLinkGetLinkedPtxSize gets the size of the linked ptx.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        size (intptr_t): Size of the linked PTX.
+
+    .. seealso:: `nvJitLinkGetLinkedPtxSize`
+    """
     with nogil:
         status = nvJitLinkGetLinkedPtxSize(<Handle>handle, <size_t*>size)
     check_status(status)
 
 
 cpdef get_linked_ptx(intptr_t handle, intptr_t ptx):
+    """nvJitLinkGetLinkedPtx gets the linked ptx.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        ptx (intptr_t): The linked PTX.
+
+    .. seealso:: `nvJitLinkGetLinkedPtx`
+    """
     with nogil:
         status = nvJitLinkGetLinkedPtx(<Handle>handle, <char*>ptx)
     check_status(status)
 
 
 cpdef get_error_log_size(intptr_t handle, intptr_t size):
+    """nvJitLinkGetErrorLogSize gets the size of the error log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        size (intptr_t): Size of the error log.
+
+    .. seealso:: `nvJitLinkGetErrorLogSize`
+    """
     with nogil:
         status = nvJitLinkGetErrorLogSize(<Handle>handle, <size_t*>size)
     check_status(status)
 
 
 cpdef get_error_log(intptr_t handle, intptr_t log):
+    """nvJitLinkGetErrorLog puts any error messages in the log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        log (intptr_t): The error log.
+
+    .. seealso:: `nvJitLinkGetErrorLog`
+    """
     with nogil:
         status = nvJitLinkGetErrorLog(<Handle>handle, <char*>log)
     check_status(status)
 
 
 cpdef get_info_log_size(intptr_t handle, intptr_t size):
+    """nvJitLinkGetInfoLogSize gets the size of the info log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        size (intptr_t): Size of the info log.
+
+    .. seealso:: `nvJitLinkGetInfoLogSize`
+    """
     with nogil:
         status = nvJitLinkGetInfoLogSize(<Handle>handle, <size_t*>size)
     check_status(status)
 
 
 cpdef get_info_log(intptr_t handle, intptr_t log):
+    """nvJitLinkGetInfoLog puts any info messages in the log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        log (intptr_t): The info log.
+
+    .. seealso:: `nvJitLinkGetInfoLog`
+    """
     with nogil:
         status = nvJitLinkGetInfoLog(<Handle>handle, <char*>log)
     check_status(status)

From f167588d6e93d02bbad0aca695f7e8a0f3256a02 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 21:01:39 +0000
Subject: [PATCH 16/34] switch from NSTR to improved NSEQ; purge NSTR

---
 cuda_bindings/cuda/bindings/_internal/utils.pxd |  1 +
 cuda_bindings/cuda/bindings/_internal/utils.pyx |  7 ++++++-
 cuda_bindings/cuda/bindings/nvjitlink.pyx       | 13 +++++++------
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
index be5d4ad61..75ec69dfb 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -140,6 +140,7 @@ ctypedef fused ResT:
     int
     int32_t
     int64_t
+    char
 
 
 ctypedef fused PtrT:
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
index 904e08da0..a0b36bbd2 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -103,7 +103,12 @@ cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj,
         nested_res_ptr.reset(nested_res_vec, True)
         nested_ptr.reset(nested_vec, True)
         for i, obj_i in enumerate(obj):
-            deref(nested_res_vec)[i] = obj_i
+            if ResT is char:
+                obj_i_bytes = (<str?>(obj_i)).encode()
+                obj_i_ptr = <char*>(obj_i_bytes)
+                deref(nested_res_vec)[i].assign(obj_i_ptr, obj_i_ptr + length)
+            else:
+                deref(nested_res_vec)[i] = obj_i
             deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data())
     elif cpython.PySequence_Check(obj):
         length = len(obj)
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index dedf49ea1..031e1f86e 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -94,8 +94,10 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
         num_options (uint32_t): Number of options passed.
         options (object): Array of size ``num_options`` of option strings. It can be:
 
-            - an :class:`int` as the pointer address to the array, or
-            - a Python sequence of ``char*``.
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
 
 
     Returns:
@@ -103,12 +105,11 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
 
     .. seealso:: `nvJitLinkCreate`
     """
-    cdef list converted_options = [(<str?>(s)).encode() for s in options]
-    cdef nullable_unique_ptr[ vector[char*] ] _options_
-    get_char_ptrs(_options_, converted_options)
+    cdef nested_resource[ char ] _options_
+    get_nested_resource_ptr[char](_options_, options, <char*>NULL)
     cdef Handle handle
     with nogil:
-        status = nvJitLinkCreate(&handle, num_options, <const char**>(_options_.data()))
+        status = nvJitLinkCreate(&handle, num_options, <const char**>(_options_.ptrs.data()))
     check_status(status)
     return <intptr_t>handle
 

From ea3d2262837b8c0361dce86f64995f17ee7938e9 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 21:58:48 +0000
Subject: [PATCH 17/34] fix inner vector<char> size & clean-up

---
 cuda_bindings/cuda/bindings/_internal/utils.pxd |  1 -
 cuda_bindings/cuda/bindings/_internal/utils.pyx | 17 ++++-------------
 cuda_bindings/cuda/bindings/nvjitlink.pyx       |  2 +-
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
index 75ec69dfb..2b45ced3b 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -155,7 +155,6 @@ cdef cppclass nested_resource[T]:
 # accepts the output pointer as input to use the return value for exception propagation
 cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1
 cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1
-cdef int get_char_ptrs(nullable_unique_ptr[ vector[char*] ] &in_out_ptr, object obj) except 1
 cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1
 
 cdef bint is_nested_sequence(data)
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
index a0b36bbd2..9c5626155 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -74,18 +74,6 @@ cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, obj
     return 0
 
 
-cdef int get_char_ptrs(nullable_unique_ptr[ vector[char*] ] &in_out_ptr, object obj) except 1:
-    if cpython.PySequence_Check(obj):
-        vec = new vector[char*](len(obj))
-        in_out_ptr.reset(vec, True)
-        for i in range(len(obj)):
-            #__TODO__ is there a lifetime difference between this char* and some other ptrT*
-            deref(vec)[i] = obj[i]
-    else:
-        in_out_ptr.reset(<vector[char*]*><intptr_t>obj, False)
-    return 0
-
-
 cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1:
     cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr
     cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr
@@ -105,8 +93,11 @@ cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj,
         for i, obj_i in enumerate(obj):
             if ResT is char:
                 obj_i_bytes = (<str?>(obj_i)).encode()
+                str_len = <size_t>(len(obj_i_bytes)) + 1  # including null termination
+                deref(nested_res_vec)[i].resize(str_len)
                 obj_i_ptr = <char*>(obj_i_bytes)
-                deref(nested_res_vec)[i].assign(obj_i_ptr, obj_i_ptr + length)
+                # cast to size_t explicitly to work around a potentially Cython bug
+                deref(nested_res_vec)[i].assign(obj_i_ptr, obj_i_ptr + <size_t>str_len)
             else:
                 deref(nested_res_vec)[i] = obj_i
             deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data())
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 031e1f86e..7fc401500 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -7,7 +7,7 @@
 cimport cython  # NOQA
 
 from ._internal.utils cimport (get_resource_ptr, get_nested_resource_ptr, nested_resource, nullable_unique_ptr,
-                       get_buffer_pointer, get_resource_ptrs, get_char_ptrs)
+                               get_buffer_pointer, get_resource_ptrs)
 
 from enum import IntEnum as _IntEnum
 from libcpp.vector cimport vector

From c1d21daa76b3fef0987f3fda41011cdca06b0d18 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 23:07:32 +0000
Subject: [PATCH 18/34] use autogen'd error code

---
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 7fc401500..450323810 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -47,21 +47,12 @@ class InputType(_IntEnum):
 # Error handling
 ###############################################################################
 
-cdef dict STATUS={
-    NVJITLINK_SUCCESS                   : 'NVJITLINK_SUCCESS',
-    NVJITLINK_ERROR_UNRECOGNIZED_OPTION : 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
-    NVJITLINK_ERROR_MISSING_ARCH        : 'NVJITLINK_ERROR_MISSING_ARCH',
-    NVJITLINK_ERROR_INVALID_INPUT       : 'NVJITLINK_ERROR_INVALID_INPUT',
-    NVJITLINK_ERROR_PTX_COMPILE         : 'NVJITLINK_ERROR_PTX_COMPILE',
-    NVJITLINK_ERROR_NVVM_COMPILE        : 'NVJITLINK_ERROR_NVVM_COMPILE',
-    NVJITLINK_ERROR_INTERNAL            : 'NVJITLINK_ERROR_INTERNAL'
-}
-
 class nvJitLinkError(Exception):
 
     def __init__(self, status):
         self.status = status
-        cdef str err = STATUS[status]
+        s = Result(status)
+        cdef str err = f"{s.name} ({s.value})"
         super(nvJitLinkError, self).__init__(err)
 
     def __reduce__(self):

From 0b836e00d30cc3631c39d9cb78671eab37120b83 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 19 Oct 2024 23:42:08 +0000
Subject: [PATCH 19/34] fix input arg conversions & output args; add
 nvJitLinkVersion

---
 .../cuda/bindings/_internal/nvjitlink.pxd     |  1 +
 .../bindings/_internal/nvjitlink_linux.pyx    | 21 +++++
 .../bindings/_internal/nvjitlink_windows.pyx  | 20 +++++
 cuda_bindings/cuda/bindings/cynvjitlink.pxd   |  1 +
 cuda_bindings/cuda/bindings/cynvjitlink.pyx   |  4 +
 cuda_bindings/cuda/bindings/nvjitlink.pxd     | 13 +--
 cuda_bindings/cuda/bindings/nvjitlink.pyx     | 79 ++++++++++++++-----
 7 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
index bca8867df..5f717d4d8 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
@@ -24,3 +24,4 @@ cdef nvJitLinkResult _nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* s
 cdef nvJitLinkResult _nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
 cdef nvJitLinkResult _nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
 cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult _nvJitLinkVersion(unsigned int* major, unsigned int* minor) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index ab3d42be3..146832f0e 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -49,6 +49,7 @@ cdef void* __nvJitLinkGetErrorLogSize = NULL
 cdef void* __nvJitLinkGetErrorLog = NULL
 cdef void* __nvJitLinkGetInfoLogSize = NULL
 cdef void* __nvJitLinkGetInfoLog = NULL
+cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library(const int driver_ver) except* with gil:
@@ -181,6 +182,13 @@ cdef int _check_or_init_nvjitlink() except -1 nogil:
         if handle == NULL:
             handle = load_library(driver_ver)
         __nvJitLinkGetInfoLog = dlsym(handle, 'nvJitLinkGetInfoLog')
+    
+    global __nvJitLinkVersion
+    __nvJitLinkVersion = dlsym(RTLD_DEFAULT, 'nvJitLinkVersion')
+    if __nvJitLinkVersion == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvJitLinkVersion = dlsym(handle, 'nvJitLinkVersion')
 
     __py_nvjitlink_init = True
     return 0
@@ -235,6 +243,9 @@ cpdef dict _inspect_function_pointers():
     
     global __nvJitLinkGetInfoLog
     data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+    
+    global __nvJitLinkVersion
+    data["__nvJitLinkVersion"] = <intptr_t>__nvJitLinkVersion
 
     func_ptrs = data
     return data
@@ -379,3 +390,13 @@ cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) exc
             raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
     return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
         handle, log)
+
+
+cdef nvJitLinkResult _nvJitLinkVersion(unsigned int* major, unsigned int* minor) except* nogil:
+    global __nvJitLinkVersion
+    _check_or_init_nvjitlink()
+    if __nvJitLinkVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkVersion is not found")
+    return (<nvJitLinkResult (*)(unsigned int*, unsigned int*) nogil>__nvJitLinkVersion)(
+        major, minor)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index feddec3ca..a6a378f86 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -38,6 +38,7 @@ cdef void* __nvJitLinkGetErrorLogSize = NULL
 cdef void* __nvJitLinkGetErrorLog = NULL
 cdef void* __nvJitLinkGetInfoLogSize = NULL
 cdef void* __nvJitLinkGetInfoLog = NULL
+cdef void* __nvJitLinkVersion = NULL
 
 
 cdef inline list get_site_packages():
@@ -192,6 +193,12 @@ cdef int _check_or_init_nvjitlink() except -1 nogil:
             __nvJitLinkGetInfoLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkGetInfoLog')
         except:
             pass
+    
+        global __nvJitLinkVersion
+        try:
+            __nvJitLinkVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvJitLinkVersion')
+        except:
+            pass
 
     __py_nvjitlink_init = True
     return 0
@@ -246,6 +253,9 @@ cpdef dict _inspect_function_pointers():
     
     global __nvJitLinkGetInfoLog
     data["__nvJitLinkGetInfoLog"] = <intptr_t>__nvJitLinkGetInfoLog
+    
+    global __nvJitLinkVersion
+    data["__nvJitLinkVersion"] = <intptr_t>__nvJitLinkVersion
 
     func_ptrs = data
     return data
@@ -390,3 +400,13 @@ cdef nvJitLinkResult _nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) exc
             raise FunctionNotFoundError("function nvJitLinkGetInfoLog is not found")
     return (<nvJitLinkResult (*)(nvJitLinkHandle, char*) nogil>__nvJitLinkGetInfoLog)(
         handle, log)
+
+
+cdef nvJitLinkResult _nvJitLinkVersion(unsigned int* major, unsigned int* minor) except* nogil:
+    global __nvJitLinkVersion
+    _check_or_init_nvjitlink()
+    if __nvJitLinkVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvJitLinkVersion is not found")
+    return (<nvJitLinkResult (*)(unsigned int*, unsigned int*) nogil>__nvJitLinkVersion)(
+        major, minor)
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 45c80d3af..6c2194736 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -66,3 +66,4 @@ cdef nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t* si
 cdef nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char* log) except* nogil
 cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* size) except* nogil
 cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil
+cdef nvJitLinkResult nvJitLinkVersion(unsigned int* major, unsigned int* minor) except* nogil
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index 3d55097b0..2a8695434 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -61,3 +61,7 @@ cdef nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t* siz
 
 cdef nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char* log) except* nogil:
     return _nvjitlink._nvJitLinkGetInfoLog(handle, log)
+
+
+cdef nvJitLinkResult nvJitLinkVersion(unsigned int* major, unsigned int* minor) except* nogil:
+    return _nvjitlink._nvJitLinkVersion(major, minor)
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 689820721..2b8841cd5 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -29,14 +29,15 @@ ctypedef nvJitLinkInputType _InputType
 ###############################################################################
 
 cpdef intptr_t create(uint32_t num_options, options) except -1
-cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name)
-cpdef add_file(intptr_t handle, int input_type, intptr_t file_name)
+cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name)
+cpdef add_file(intptr_t handle, int input_type, file_name)
 cpdef complete(intptr_t handle)
-cpdef get_linked_cubin_size(intptr_t handle, intptr_t size)
+cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0
 cpdef get_linked_cubin(intptr_t handle, intptr_t cubin)
-cpdef get_linked_ptx_size(intptr_t handle, intptr_t size)
+cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0
 cpdef get_linked_ptx(intptr_t handle, intptr_t ptx)
-cpdef get_error_log_size(intptr_t handle, intptr_t size)
+cpdef size_t get_error_log_size(intptr_t handle) except? 0
 cpdef get_error_log(intptr_t handle, intptr_t log)
-cpdef get_info_log_size(intptr_t handle, intptr_t size)
+cpdef size_t get_info_log_size(intptr_t handle) except? 0
 cpdef get_info_log(intptr_t handle, intptr_t log)
+cpdef tuple version()
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 450323810..b75596f6c 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -105,7 +105,7 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
     return <intptr_t>handle
 
 
-cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intptr_t name):
+cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name):
     """nvJitLinkAddData adds data image to the link.
 
     Args:
@@ -113,27 +113,35 @@ cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, intp
         input_type (InputType): kind of input.
         data (intptr_t): pointer to data image in memory.
         size (size_t): size of the data.
-        name (intptr_t): name of input object.
+        name (str): name of input object.
 
     .. seealso:: `nvJitLinkAddData`
     """
+    if not isinstance(name, str):
+        raise TypeError("name must be a Python str")
+    cdef bytes _temp_name_ = (<str>name).encode()
+    cdef char* _name_ = _temp_name_
     with nogil:
-        status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>data, size, <const char*>name)
+        status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>data, size, <const char*>_name_)
     check_status(status)
 
 
-cpdef add_file(intptr_t handle, int input_type, intptr_t file_name):
+cpdef add_file(intptr_t handle, int input_type, file_name):
     """nvJitLinkAddFile reads data from file and links it in.
 
     Args:
         handle (intptr_t): nvJitLink handle.
         input_type (InputType): kind of input.
-        file_name (intptr_t): name of file.
+        file_name (str): name of file.
 
     .. seealso:: `nvJitLinkAddFile`
     """
+    if not isinstance(file_name, str):
+        raise TypeError("file_name must be a Python str")
+    cdef bytes _temp_file_name_ = (<str>file_name).encode()
+    cdef char* _file_name_ = _temp_file_name_
     with nogil:
-        status = nvJitLinkAddFile(<Handle>handle, <_InputType>input_type, <const char*>file_name)
+        status = nvJitLinkAddFile(<Handle>handle, <_InputType>input_type, <const char*>_file_name_)
     check_status(status)
 
 
@@ -150,18 +158,22 @@ cpdef complete(intptr_t handle):
     check_status(status)
 
 
-cpdef get_linked_cubin_size(intptr_t handle, intptr_t size):
+cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0:
     """nvJitLinkGetLinkedCubinSize gets the size of the linked cubin.
 
     Args:
         handle (intptr_t): nvJitLink handle.
-        size (intptr_t): Size of the linked cubin.
+
+    Returns:
+        size_t: Size of the linked cubin.
 
     .. seealso:: `nvJitLinkGetLinkedCubinSize`
     """
+    cdef size_t size
     with nogil:
-        status = nvJitLinkGetLinkedCubinSize(<Handle>handle, <size_t*>size)
+        status = nvJitLinkGetLinkedCubinSize(<Handle>handle, &size)
     check_status(status)
+    return size
 
 
 cpdef get_linked_cubin(intptr_t handle, intptr_t cubin):
@@ -178,18 +190,22 @@ cpdef get_linked_cubin(intptr_t handle, intptr_t cubin):
     check_status(status)
 
 
-cpdef get_linked_ptx_size(intptr_t handle, intptr_t size):
+cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0:
     """nvJitLinkGetLinkedPtxSize gets the size of the linked ptx.
 
     Args:
         handle (intptr_t): nvJitLink handle.
-        size (intptr_t): Size of the linked PTX.
+
+    Returns:
+        size_t: Size of the linked PTX.
 
     .. seealso:: `nvJitLinkGetLinkedPtxSize`
     """
+    cdef size_t size
     with nogil:
-        status = nvJitLinkGetLinkedPtxSize(<Handle>handle, <size_t*>size)
+        status = nvJitLinkGetLinkedPtxSize(<Handle>handle, &size)
     check_status(status)
+    return size
 
 
 cpdef get_linked_ptx(intptr_t handle, intptr_t ptx):
@@ -206,18 +222,22 @@ cpdef get_linked_ptx(intptr_t handle, intptr_t ptx):
     check_status(status)
 
 
-cpdef get_error_log_size(intptr_t handle, intptr_t size):
+cpdef size_t get_error_log_size(intptr_t handle) except? 0:
     """nvJitLinkGetErrorLogSize gets the size of the error log.
 
     Args:
         handle (intptr_t): nvJitLink handle.
-        size (intptr_t): Size of the error log.
+
+    Returns:
+        size_t: Size of the error log.
 
     .. seealso:: `nvJitLinkGetErrorLogSize`
     """
+    cdef size_t size
     with nogil:
-        status = nvJitLinkGetErrorLogSize(<Handle>handle, <size_t*>size)
+        status = nvJitLinkGetErrorLogSize(<Handle>handle, &size)
     check_status(status)
+    return size
 
 
 cpdef get_error_log(intptr_t handle, intptr_t log):
@@ -234,18 +254,22 @@ cpdef get_error_log(intptr_t handle, intptr_t log):
     check_status(status)
 
 
-cpdef get_info_log_size(intptr_t handle, intptr_t size):
+cpdef size_t get_info_log_size(intptr_t handle) except? 0:
     """nvJitLinkGetInfoLogSize gets the size of the info log.
 
     Args:
         handle (intptr_t): nvJitLink handle.
-        size (intptr_t): Size of the info log.
+
+    Returns:
+        size_t: Size of the info log.
 
     .. seealso:: `nvJitLinkGetInfoLogSize`
     """
+    cdef size_t size
     with nogil:
-        status = nvJitLinkGetInfoLogSize(<Handle>handle, <size_t*>size)
+        status = nvJitLinkGetInfoLogSize(<Handle>handle, &size)
     check_status(status)
+    return size
 
 
 cpdef get_info_log(intptr_t handle, intptr_t log):
@@ -260,3 +284,22 @@ cpdef get_info_log(intptr_t handle, intptr_t log):
     with nogil:
         status = nvJitLinkGetInfoLog(<Handle>handle, <char*>log)
     check_status(status)
+
+
+cpdef tuple version():
+    """nvJitLinkVersion returns the current version of nvJitLink.
+
+    Returns:
+        A 2-tuple containing:
+
+        - unsigned int: The major version.
+        - unsigned int: The minor version.
+
+    .. seealso:: `nvJitLinkVersion`
+    """
+    cdef unsigned int major
+    cdef unsigned int minor
+    with nogil:
+        status = nvJitLinkVersion(&major, &minor)
+    check_status(status)
+    return (major, minor)

From fab638245c06d79bc18defc2dce6edda85196d55 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 20 Oct 2024 19:16:26 +0000
Subject: [PATCH 20/34] start fixing tests

---
 cuda_bindings/tests/test_nvjitlink.py | 51 +++++++++++++--------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index 37129e4a2..ec475b04a 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -1,23 +1,21 @@
-import pytest
-from cuda.bindings import nvjitlink
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-dir(nvjitlink)
+import pytest
 
-def test_create_no_arch_error():
-    # nvjitlink expects at least the architecture to be specified.
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_MISSING_ARCH error"):
-        nvjitlink.create()
+from cuda.bindings import nvjitlink
 
 
 def test_invalid_arch_error():
     # sm_XX is not a valid architecture
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvjitlink.create("-arch=sm_XX")
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-arch=sm_XX"])
 
 
 def test_unrecognized_option_error():
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_UNRECOGNIZED_OPTION error"):
-        nvjitlink.create("-fictitious_option")
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-fictitious_option"])
 
 
 def test_invalid_option_type_error():
@@ -41,17 +39,17 @@ def test_complete_empty():
     "input_file,input_type",
     [
         ("device_functions_cubin", nvjitlink.InputType.CUBIN),
-        ("device_functions_fatbin", InputType.FATBIN),
-        ("device_functions_ptx", InputType.PTX),
-        ("device_functions_object", InputType.OBJECT),
-        ("device_functions_archive", InputType.LIBRARY),
+        ("device_functions_fatbin", nvjitlink.InputType.FATBIN),
+        ("device_functions_ptx", nvjitlink.InputType.PTX),
+        ("device_functions_object", nvjitlink.InputType.OBJECT),
+        ("device_functions_archive", nvjitlink.InputType.LIBRARY),
     ],
 )
 def test_add_file(input_file, input_type, gpu_arch_flag, request):
     filename, data = request.getfixturevalue(input_file)
 
     handle = nvjitlink.create(gpu_arch_flag)
-    nvjitlink.add_data(handle, input_type.value, data, filename)
+    nvjitlink.add_data(handle, input_type, data, filename)
     nvjitlink.destroy(handle)
 
 
@@ -62,14 +60,14 @@ def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
     filename, data = device_functions_ltoir_object
 
     handle = nvjitlink.create(gpu_arch_flag, "-lto")
-    nvjitlink.add_data(handle, InputType.OBJECT.value, data, filename)
+    nvjitlink.add_data(handle, nvjitlink.InputType.OBJECT, data, filename)
     nvjitlink.destroy(handle)
 
 
 def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
     handle = nvjitlink.create(gpu_arch_flag)
     filename, data = undefined_extern_cubin
-    input_type = InputType.CUBIN.value
+    input_type = nvjitlink.InputType.CUBIN
     nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError):
         nvjitlink.complete(handle)
@@ -84,7 +82,7 @@ def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
 def test_get_info_log(device_functions_cubin, gpu_arch_flag):
     handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
+    input_type = nvjitlink.InputType.CUBIN
     nvjitlink.add_data(handle, input_type, data, filename)
     nvjitlink.complete(handle)
     info_log = nvjitlink.get_info_log(handle)
@@ -96,7 +94,7 @@ def test_get_info_log(device_functions_cubin, gpu_arch_flag):
 def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
     handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
+    input_type = nvjitlink.InputType.CUBIN
     nvjitlink.add_data(handle, input_type, data, filename)
     nvjitlink.complete(handle)
     cubin = nvjitlink.get_linked_cubin(handle)
@@ -111,7 +109,7 @@ def test_get_linked_cubin_link_not_complete_error(
 ):
     handle = nvjitlink.create(gpu_arch_flag)
     filename, data = device_functions_cubin
-    input_type = InputType.CUBIN.value
+    input_type = nvjitlink.InputType.CUBIN
     nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
         nvjitlink.get_linked_cubin(handle)
@@ -124,7 +122,7 @@ def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag)
     # containing an LTOIR container, because that is what NVCC produces when
     # LTO is requested. So we need to use the OBJECT input type, and the linker
     # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
+    input_type = nvjitlink.InputType.OBJECT
     handle = nvjitlink.create(gpu_arch_flag, "-lto")
     nvjitlink.add_data(handle, input_type, data, filename)
     nvjitlink.complete(handle)
@@ -141,7 +139,7 @@ def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
     # containing an LTOIR container, because that is what NVCC produces when
     # LTO is requested. So we need to use the OBJECT input type, and the linker
     # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = InputType.OBJECT.value
+    input_type = nvjitlink.InputType.OBJECT
     handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
     nvjitlink.add_data(handle, input_type, data, filename)
     nvjitlink.complete(handle)
@@ -154,7 +152,7 @@ def test_get_linked_ptx_link_not_complete_error(
 ):
     handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
     filename, data = device_functions_ltoir_object
-    input_type = InputType.OBJECT.value
+    input_type = nvjitlink.InputType.OBJECT
     nvjitlink.add_data(handle, input_type, data, filename)
     with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
         nvjitlink.get_linked_ptx(handle)
@@ -162,5 +160,6 @@ def test_get_linked_ptx_link_not_complete_error(
 
 
 def test_package_version():
-    assert pynvjitlink.__version__ is not None
-    assert len(str(pynvjitlink.__version__)) > 0
\ No newline at end of file
+    ver = nvjitlink.version()
+    assert len(ver) == 2
+    assert ver >= (12, 0)

From 7fde00efc6a2b71a9859bb28326ba6cc938254b2 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 20 Oct 2024 23:21:15 +0000
Subject: [PATCH 21/34] clean up a bit

---
 cuda_bindings/cuda/bindings/cynvjitlink.pxd | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 6c2194736..3c22d939e 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -37,15 +37,6 @@ ctypedef enum nvJitLinkInputType "nvJitLinkInputType":
 
 
 # types
-cdef extern from *:
-    """
-    #include <driver_types.h>
-    #include <library_types.h>
-    #include <cuComplex.h>
-    """
-    ctypedef void* cudaStream_t 'cudaStream_t'
-
-
 ctypedef void* nvJitLinkHandle 'nvJitLinkHandle'
 
 

From 22984359527374e8f384079a43ba6290ae9f996b Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Mon, 21 Oct 2024 00:48:37 +0000
Subject: [PATCH 22/34] add destroy docstring

---
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index b75596f6c..6cc67d7e8 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -72,6 +72,13 @@ cdef int check_status(int status) except 1 nogil:
 ###############################################################################
 
 cpdef destroy(intptr_t handle):
+    """nvJitLinkDestroy frees the memory associated with the given handle.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    .. seealso:: `nvJitLinkDestroy`
+    """
     cdef Handle h = <Handle>handle
     with nogil:
         status = nvJitLinkDestroy(&h)

From cb6c5b4068fc5c18ca9f949cce6414a9c1bf7496 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 21 Oct 2024 17:42:01 -0700
Subject: [PATCH 23/34] update tests; regen bindings

---
 cuda_bindings/tests/test_nvjitlink.py | 183 ++++++++++----------------
 1 file changed, 67 insertions(+), 116 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index ec475b04a..c92a100e7 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -3,14 +3,43 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import pytest
-
+import os
 from cuda.bindings import nvjitlink
 
+ptx_code = """
+.version 8.5
+.target sm_90
+.address_size 64
 
-def test_invalid_arch_error():
-    # sm_XX is not a valid architecture
-    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        nvjitlink.create(1, ["-arch=sm_XX"])
+.visible .entry _Z6kernelPi(
+    .param .u64 _Z6kernelPi_param_0
+)
+{
+    .reg .pred  %p<2>;
+    .reg .b32   %r<3>;
+    .reg .b64   %rd<3>;
+
+    ld.param.u64    %rd1, [_Z6kernelPi_param_0];
+    cvta.to.global.u64  %rd2, %rd1;
+    mov.u32     %r1, %tid.x;
+    st.global.u32   [%rd2+0], %r1;
+    ret;
+}
+"""
+
+minimal_kernel = """
+.version 6.4
+.target sm_75
+.address_size 64
+
+.visible .entry _kernel() {
+    ret;
+}
+"""
+
+# Convert PTX code to bytes
+ptx_bytes = ptx_code.encode('utf-8')
+minimal_kernel_bytes = minimal_kernel.encode('utf-8')
 
 
 def test_unrecognized_option_error():
@@ -18,144 +47,66 @@ def test_unrecognized_option_error():
         nvjitlink.create(1, ["-fictitious_option"])
 
 
-def test_invalid_option_type_error():
-    with pytest.raises(TypeError, match="Expecting only strings"):
-        nvjitlink.create("-arch", 53)
+def test_invalid_arch_error():
+    # sm_XX is not a valid architecture
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-arch=sm_XX"])
 
 
 def test_create_and_destroy():
-    handle = nvjitlink.create("-arch=sm_53")
+    handle = nvjitlink.create(1, ["-arch=sm_53"])
     assert handle != 0
     nvjitlink.destroy(handle)
 
 
 def test_complete_empty():
-    handle = nvjitlink.create("-arch=sm_75")
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
     nvjitlink.complete(handle)
     nvjitlink.destroy(handle)
 
+def test_add_data():
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    data = ptx_bytes
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, data, len(data), "test_data")
 
-@pytest.mark.parametrize(
-    "input_file,input_type",
-    [
-        ("device_functions_cubin", nvjitlink.InputType.CUBIN),
-        ("device_functions_fatbin", nvjitlink.InputType.FATBIN),
-        ("device_functions_ptx", nvjitlink.InputType.PTX),
-        ("device_functions_object", nvjitlink.InputType.OBJECT),
-        ("device_functions_archive", nvjitlink.InputType.LIBRARY),
-    ],
-)
-def test_add_file(input_file, input_type, gpu_arch_flag, request):
-    filename, data = request.getfixturevalue(input_file)
-
-    handle = nvjitlink.create(gpu_arch_flag)
-    nvjitlink.add_data(handle, input_type, data, filename)
-    nvjitlink.destroy(handle)
-
-
-# We test the LTO input case separately as it requires the `-lto` flag. The
-# OBJECT input type is used because the LTO-IR container is packaged in an ELF
-# object when produced by NVCC.
-def test_add_file_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
 
-    handle = nvjitlink.create(gpu_arch_flag, "-lto")
-    nvjitlink.add_data(handle, nvjitlink.InputType.OBJECT, data, filename)
-    nvjitlink.destroy(handle)
+def test_add_file():
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    file_path = "test_file.cubin"
+    with open (file_path, "wb") as f:
+        f.write(ptx_bytes)
 
-
-def test_get_error_log(undefined_extern_cubin, gpu_arch_flag):
-    handle = nvjitlink.create(gpu_arch_flag)
-    filename, data = undefined_extern_cubin
-    input_type = nvjitlink.InputType.CUBIN
-    nvjitlink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError):
-        nvjitlink.complete(handle)
-    error_log = nvjitlink.get_error_log(handle)
-    nvjitlink.destroy(handle)
-    assert (
-        "Undefined reference to '_Z5undefff' "
-        "in 'undefined_extern.cubin'" in error_log
-    )
-
-
-def test_get_info_log(device_functions_cubin, gpu_arch_flag):
-    handle = nvjitlink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = nvjitlink.InputType.CUBIN
-    nvjitlink.add_data(handle, input_type, data, filename)
+    nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
     nvjitlink.complete(handle)
-    info_log = nvjitlink.get_info_log(handle)
     nvjitlink.destroy(handle)
-    # Info log is empty
-    assert "" == info_log
+    
+    os.remove(file_path)
 
 
-def test_get_linked_cubin(device_functions_cubin, gpu_arch_flag):
-    handle = nvjitlink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = nvjitlink.InputType.CUBIN
-    nvjitlink.add_data(handle, input_type, data, filename)
+def test_get_error_log():
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
     nvjitlink.complete(handle)
-    cubin = nvjitlink.get_linked_cubin(handle)
+    log_size = nvjitlink.get_error_log_size(handle)
+    log = nvjitlink.get_error_log(handle)
+    assert len(log) == log_size
     nvjitlink.destroy(handle)
 
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
-
-
-def test_get_linked_cubin_link_not_complete_error(
-    device_functions_cubin, gpu_arch_flag
-):
-    handle = nvjitlink.create(gpu_arch_flag)
-    filename, data = device_functions_cubin
-    input_type = nvjitlink.InputType.CUBIN
-    nvjitlink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvjitlink.get_linked_cubin(handle)
-    nvjitlink.destroy(handle)
 
-
-def test_get_linked_cubin_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = nvjitlink.InputType.OBJECT
-    handle = nvjitlink.create(gpu_arch_flag, "-lto")
-    nvjitlink.add_data(handle, input_type, data, filename)
+def test_get_info_log():
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
     nvjitlink.complete(handle)
-    cubin = nvjitlink.get_linked_cubin(handle)
+    log_size = nvjitlink.get_info_log_size(handle)
+    log = nvjitlink.get_info_log(handle)
+    assert len(log) == log_size
     nvjitlink.destroy(handle)
 
-    # Just check we got something that looks like an ELF
-    assert cubin[:4] == b"\x7fELF"
 
-
-def test_get_linked_ptx_from_lto(device_functions_ltoir_object, gpu_arch_flag):
-    filename, data = device_functions_ltoir_object
-    # device_functions_ltoir_object is a host object containing a fatbin
-    # containing an LTOIR container, because that is what NVCC produces when
-    # LTO is requested. So we need to use the OBJECT input type, and the linker
-    # retrieves the LTO IR from it because we passed the -lto flag.
-    input_type = nvjitlink.InputType.OBJECT
-    handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
-    nvjitlink.add_data(handle, input_type, data, filename)
+def test_get_linked_cubin():
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
     nvjitlink.complete(handle)
-    nvjitlink.get_linked_ptx(handle)
-    nvjitlink.destroy(handle)
-
-
-def test_get_linked_ptx_link_not_complete_error(
-    device_functions_ltoir_object, gpu_arch_flag
-):
-    handle = nvjitlink.create(gpu_arch_flag, "-lto", "-ptx")
-    filename, data = device_functions_ltoir_object
-    input_type = nvjitlink.InputType.OBJECT
-    nvjitlink.add_data(handle, input_type, data, filename)
-    with pytest.raises(RuntimeError, match="NVJITLINK_ERROR_INTERNAL error"):
-        nvjitlink.get_linked_ptx(handle)
+    cubin_size = nvjitlink.get_linked_cubin_size(handle)
+    cubin = nvjitlink.get_linked_cubin(handle)
+    assert len(cubin) == cubin_size
     nvjitlink.destroy(handle)
 
 

From e941305c9eddded4ce296d1125c13310cffdbdd8 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 21 Oct 2024 18:44:41 -0700
Subject: [PATCH 24/34] update test file; TODO: add coverage for GetLinkedPtx
 and expected errors once bindings are checked in

---
 cuda_bindings/tests/test_nvjitlink.py | 83 +++++++++++++++------------
 1 file changed, 47 insertions(+), 36 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index c92a100e7..605b98bfe 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -1,10 +1,16 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright 2021-2024 NVIDIA Corporation.  All rights reserved.
 #
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
 
 import pytest
 import os
-from cuda.bindings import nvjitlink
+import cuda.bindings
+
+
 
 ptx_code = """
 .version 8.5
@@ -41,76 +47,81 @@
 ptx_bytes = ptx_code.encode('utf-8')
 minimal_kernel_bytes = minimal_kernel.encode('utf-8')
 
-
 def test_unrecognized_option_error():
-    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        nvjitlink.create(1, ["-fictitious_option"])
+    with pytest.raises(cuda.bindings.nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        cuda.bindings.nvjitlink.create(1, ["-fictitious_option"])
 
 
 def test_invalid_arch_error():
     # sm_XX is not a valid architecture
-    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        nvjitlink.create(1, ["-arch=sm_XX"])
+    with pytest.raises(cuda.bindings.nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        cuda.bindings.nvjitlink.create(1, ["-arch=sm_XX"])
 
 
 def test_create_and_destroy():
-    handle = nvjitlink.create(1, ["-arch=sm_53"])
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_53"])
     assert handle != 0
-    nvjitlink.destroy(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
 
 def test_complete_empty():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
-    nvjitlink.complete(handle)
-    nvjitlink.destroy(handle)
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
+    cuda.bindings.nvjitlink.complete(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
 def test_add_data():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
     data = ptx_bytes
-    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, data, len(data), "test_data")
+    cuda.bindings.nvjitlink.add_data(handle, cuda.bindings.nvjitlink.InputType.ANY, data, len(data), "test_data")
+    cuda.bindings.nvjitlink.complete(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
 
 def test_add_file():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
     file_path = "test_file.cubin"
     with open (file_path, "wb") as f:
         f.write(ptx_bytes)
 
-    nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
-    nvjitlink.complete(handle)
-    nvjitlink.destroy(handle)
+    cuda.bindings.nvjitlink.add_file(handle, cuda.bindings.nvjitlink.InputType.ANY, str(file_path))
+    cuda.bindings.nvjitlink.complete(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
     
     os.remove(file_path)
 
 
 def test_get_error_log():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
-    nvjitlink.complete(handle)
-    log_size = nvjitlink.get_error_log_size(handle)
-    log = nvjitlink.get_error_log(handle)
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
+    cuda.bindings.nvjitlink.complete(handle)
+    log_size = cuda.bindings.nvjitlink.get_error_log_size(handle)
+    log = bytearray(log_size)
+    cuda.bindings.nvjitlink.get_error_log(handle, log)
     assert len(log) == log_size
-    nvjitlink.destroy(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
 
 def test_get_info_log():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
-    nvjitlink.complete(handle)
-    log_size = nvjitlink.get_info_log_size(handle)
-    log = nvjitlink.get_info_log(handle)
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
+    cuda.bindings.nvjitlink.complete(handle)
+    log_size = cuda.bindings.nvjitlink.get_info_log_size(handle)
+    log = bytearray(log_size)
+    cuda.bindings.nvjitlink.get_info_log(handle, log)
     assert len(log) == log_size
-    nvjitlink.destroy(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
 
 def test_get_linked_cubin():
-    handle = nvjitlink.create(1, ["-arch=sm_90"])
-    nvjitlink.complete(handle)
-    cubin_size = nvjitlink.get_linked_cubin_size(handle)
-    cubin = nvjitlink.get_linked_cubin(handle)
+    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
+    cuda.bindings.nvjitlink.complete(handle)
+    cubin_size = cuda.bindings.nvjitlink.get_linked_cubin_size(handle)
+    cubin = bytearray(cubin_size)
+    cuda.bindings.nvjitlink.get_linked_cubin(handle, cubin)
     assert len(cubin) == cubin_size
-    nvjitlink.destroy(handle)
+    cuda.bindings.nvjitlink.destroy(handle)
 
+#TODO add a ptx test
 
 def test_package_version():
-    ver = nvjitlink.version()
+    ver = cuda.bindings.nvjitlink.version()
     assert len(ver) == 2
-    assert ver >= (12, 0)
+    assert ver >= (12, 0)
\ No newline at end of file

From df605e99626adb63960ca33b0ddb64f0bce8d026 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 22 Oct 2024 09:34:51 -0700
Subject: [PATCH 25/34] update test file

---
 cuda_bindings/tests/test_nvjitlink.py | 104 ++++++++++++++------------
 1 file changed, 55 insertions(+), 49 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index 605b98bfe..6524c4a88 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -8,11 +8,10 @@
 
 import pytest
 import os
-import cuda.bindings
+from cuda.bindings import nvjitlink
 
 
-
-ptx_code = """
+ptx_kernel = """
 .version 8.5
 .target sm_90
 .address_size 64
@@ -33,95 +32,102 @@
 }
 """
 
-minimal_kernel = """
-.version 6.4
-.target sm_75
+minimal_ptx_kernel = """
+.version 8.5
+.target sm_90
 .address_size 64
 
-.visible .entry _kernel() {
+.func _MinimalKernel()
+{
     ret;
 }
 """
 
-# Convert PTX code to bytes
-ptx_bytes = ptx_code.encode('utf-8')
-minimal_kernel_bytes = minimal_kernel.encode('utf-8')
+ptx_kernel_bytes = ptx_kernel.encode('utf-8')
+minimal_ptx_kernel_bytes = minimal_ptx_kernel.encode('utf-8')
 
 def test_unrecognized_option_error():
-    with pytest.raises(cuda.bindings.nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        cuda.bindings.nvjitlink.create(1, ["-fictitious_option"])
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-fictitious_option"])
 
 
 def test_invalid_arch_error():
-    # sm_XX is not a valid architecture
-    with pytest.raises(cuda.bindings.nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        cuda.bindings.nvjitlink.create(1, ["-arch=sm_XX"])
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-arch=sm_XX"])
 
 
 def test_create_and_destroy():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_53"])
+    handle = nvjitlink.create(1, ["-arch=sm_53"])
     assert handle != 0
-    cuda.bindings.nvjitlink.destroy(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_complete_empty():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
-    cuda.bindings.nvjitlink.complete(handle)
-    cuda.bindings.nvjitlink.destroy(handle)
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
+
 
 def test_add_data():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
-    data = ptx_bytes
-    cuda.bindings.nvjitlink.add_data(handle, cuda.bindings.nvjitlink.InputType.ANY, data, len(data), "test_data")
-    cuda.bindings.nvjitlink.complete(handle)
-    cuda.bindings.nvjitlink.destroy(handle)
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_kernel_bytes, len(ptx_kernel_bytes), "test_data")
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, minimal_ptx_kernel_bytes, len(minimal_ptx_kernel_bytes), "minimal_test_data")
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_add_file():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
     file_path = "test_file.cubin"
     with open (file_path, "wb") as f:
-        f.write(ptx_bytes)
+        f.write(ptx_kernel_bytes)
 
-    cuda.bindings.nvjitlink.add_file(handle, cuda.bindings.nvjitlink.InputType.ANY, str(file_path))
-    cuda.bindings.nvjitlink.complete(handle)
-    cuda.bindings.nvjitlink.destroy(handle)
-    
+    nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
     os.remove(file_path)
 
 
 def test_get_error_log():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
-    cuda.bindings.nvjitlink.complete(handle)
-    log_size = cuda.bindings.nvjitlink.get_error_log_size(handle)
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    nvjitlink.complete(handle)
+    log_size = nvjitlink.get_error_log_size(handle)
     log = bytearray(log_size)
-    cuda.bindings.nvjitlink.get_error_log(handle, log)
+    nvjitlink.get_error_log(handle, log)
     assert len(log) == log_size
-    cuda.bindings.nvjitlink.destroy(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_get_info_log():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
-    cuda.bindings.nvjitlink.complete(handle)
-    log_size = cuda.bindings.nvjitlink.get_info_log_size(handle)
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_kernel_bytes, len(ptx_kernel_bytes), "test_data")
+    nvjitlink.complete(handle)
+    log_size = nvjitlink.get_info_log_size(handle)
     log = bytearray(log_size)
-    cuda.bindings.nvjitlink.get_info_log(handle, log)
+    nvjitlink.get_info_log(handle, log)
     assert len(log) == log_size
-    cuda.bindings.nvjitlink.destroy(handle)
+    nvjitlink.destroy(handle)
 
 
 def test_get_linked_cubin():
-    handle = cuda.bindings.nvjitlink.create(1, ["-arch=sm_90"])
-    cuda.bindings.nvjitlink.complete(handle)
-    cubin_size = cuda.bindings.nvjitlink.get_linked_cubin_size(handle)
+    handle = nvjitlink.create(1, ["-arch=sm_90"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_kernel_bytes, len(ptx_kernel_bytes), "test_data")
+    nvjitlink.complete(handle)
+    cubin_size = nvjitlink.get_linked_cubin_size(handle)
     cubin = bytearray(cubin_size)
-    cuda.bindings.nvjitlink.get_linked_cubin(handle, cubin)
+    nvjitlink.get_linked_cubin(handle, cubin)
     assert len(cubin) == cubin_size
-    cuda.bindings.nvjitlink.destroy(handle)
+    nvjitlink.destroy(handle)
+
+
+def test_get_linked_ptx():
+    # TODO improve this test to call get_linked_ptx without this error
+    handle = nvjitlink.create(2, ["-arch=sm_90", "-lto"])
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_NVVM_COMPILE"):
+        nvjitlink.complete(handle)
 
-#TODO add a ptx test
 
 def test_package_version():
-    ver = cuda.bindings.nvjitlink.version()
+    ver = nvjitlink.version()
     assert len(ver) == 2
-    assert ver >= (12, 0)
\ No newline at end of file
+    assert ver >= (12, 0)

From b1536f3cebcadc44ec99d8a5a9f49ae9ed1a239e Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 22 Oct 2024 10:10:51 -0700
Subject: [PATCH 26/34] regenerate bindings

---
 .../bindings/_internal/nvjitlink_linux.pyx    |   2 +-
 .../bindings/_internal/nvjitlink_windows.pyx  |   2 +-
 .../cuda/bindings/_internal/utils.pxd         |   4 +
 .../cuda/bindings/_internal/utils.pyx         |   8 +-
 cuda_bindings/cuda/bindings/nvjitlink.pxd     |   8 +-
 cuda_bindings/cuda/bindings/nvjitlink.pyx     | 146 ++----------------
 6 files changed, 29 insertions(+), 141 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 146832f0e..eb882b4fb 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
 # This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index a6a378f86..e50de88af 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
 # This code was automatically generated across versions from 12.0.1 to 12.6.2. Do not modify it directly.
 
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
index 2b45ced3b..d629179dc 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 from libc.stdint cimport int32_t, int64_t, intptr_t
 from libcpp.vector cimport vector
 from libcpp cimport bool as cppbool
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
index 9c5626155..55945ec96 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 cimport cpython
 from libc.stdint cimport intptr_t
 from libcpp.utility cimport move
@@ -29,7 +33,9 @@ cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=True) except*:
     else:  # try buffer protocol
         try:
             status = cpython.PyObject_GetBuffer(buf, &view, flags)
-            assert view.len == size
+            # when the caller does not provide a size, it is set to -1 at generate-time by cybind
+            if size != -1:
+                assert view.len == size
             assert view.ndim == 1
         except Exception as e:
             adj = "writable " if not readonly else ""
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 2b8841cd5..59b56d2a3 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -33,11 +33,11 @@ cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name
 cpdef add_file(intptr_t handle, int input_type, file_name)
 cpdef complete(intptr_t handle)
 cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0
-cpdef get_linked_cubin(intptr_t handle, intptr_t cubin)
+cpdef get_linked_cubin(intptr_t handle, cubin)
 cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0
-cpdef get_linked_ptx(intptr_t handle, intptr_t ptx)
+cpdef get_linked_ptx(intptr_t handle, ptx)
 cpdef size_t get_error_log_size(intptr_t handle) except? 0
-cpdef get_error_log(intptr_t handle, intptr_t log)
+cpdef get_error_log(intptr_t handle, log)
 cpdef size_t get_info_log_size(intptr_t handle) except? 0
-cpdef get_info_log(intptr_t handle, intptr_t log)
+cpdef get_info_log(intptr_t handle, log)
 cpdef tuple version()
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 6cc67d7e8..8237ea14c 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -86,23 +86,6 @@ cpdef destroy(intptr_t handle):
 
 
 cpdef intptr_t create(uint32_t num_options, options) except -1:
-    """nvJitLinkCreate creates an instance of nvJitLinkHandle with the given input options, and sets the output parameter ``handle``.
-
-    Args:
-        num_options (uint32_t): Number of options passed.
-        options (object): Array of size ``num_options`` of option strings. It can be:
-
-            - an :class:`int` as the pointer address to the nested sequence, or
-            - a Python sequence of :class:`int`\s, each of which is a pointer address
-              to a valid sequence of 'char', or
-            - a nested Python sequence of ``str``.
-
-
-    Returns:
-        intptr_t: Address of nvJitLink handle.
-
-    .. seealso:: `nvJitLinkCreate`
-    """
     cdef nested_resource[ char ] _options_
     get_nested_resource_ptr[char](_options_, options, <char*>NULL)
     cdef Handle handle
@@ -113,17 +96,6 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
 
 
 cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name):
-    """nvJitLinkAddData adds data image to the link.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        input_type (InputType): kind of input.
-        data (intptr_t): pointer to data image in memory.
-        size (size_t): size of the data.
-        name (str): name of input object.
-
-    .. seealso:: `nvJitLinkAddData`
-    """
     if not isinstance(name, str):
         raise TypeError("name must be a Python str")
     cdef bytes _temp_name_ = (<str>name).encode()
@@ -134,15 +106,6 @@ cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name
 
 
 cpdef add_file(intptr_t handle, int input_type, file_name):
-    """nvJitLinkAddFile reads data from file and links it in.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        input_type (InputType): kind of input.
-        file_name (str): name of file.
-
-    .. seealso:: `nvJitLinkAddFile`
-    """
     if not isinstance(file_name, str):
         raise TypeError("file_name must be a Python str")
     cdef bytes _temp_file_name_ = (<str>file_name).encode()
@@ -153,29 +116,12 @@ cpdef add_file(intptr_t handle, int input_type, file_name):
 
 
 cpdef complete(intptr_t handle):
-    """nvJitLinkComplete does the actual link.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-
-    .. seealso:: `nvJitLinkComplete`
-    """
     with nogil:
         status = nvJitLinkComplete(<Handle>handle)
     check_status(status)
 
 
 cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0:
-    """nvJitLinkGetLinkedCubinSize gets the size of the linked cubin.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-
-    Returns:
-        size_t: Size of the linked cubin.
-
-    .. seealso:: `nvJitLinkGetLinkedCubinSize`
-    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetLinkedCubinSize(<Handle>handle, &size)
@@ -183,31 +129,14 @@ cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0:
     return size
 
 
-cpdef get_linked_cubin(intptr_t handle, intptr_t cubin):
-    """nvJitLinkGetLinkedCubin gets the linked cubin.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        cubin (intptr_t): The linked cubin.
-
-    .. seealso:: `nvJitLinkGetLinkedCubin`
-    """
+cpdef get_linked_cubin(intptr_t handle, cubin):
+    cdef void* _cubin_ = get_buffer_pointer(cubin, -1, readonly=False)
     with nogil:
-        status = nvJitLinkGetLinkedCubin(<Handle>handle, <void*>cubin)
+        status = nvJitLinkGetLinkedCubin(<Handle>handle, <void*>_cubin_)
     check_status(status)
 
 
 cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0:
-    """nvJitLinkGetLinkedPtxSize gets the size of the linked ptx.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-
-    Returns:
-        size_t: Size of the linked PTX.
-
-    .. seealso:: `nvJitLinkGetLinkedPtxSize`
-    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetLinkedPtxSize(<Handle>handle, &size)
@@ -215,31 +144,14 @@ cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0:
     return size
 
 
-cpdef get_linked_ptx(intptr_t handle, intptr_t ptx):
-    """nvJitLinkGetLinkedPtx gets the linked ptx.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        ptx (intptr_t): The linked PTX.
-
-    .. seealso:: `nvJitLinkGetLinkedPtx`
-    """
+cpdef get_linked_ptx(intptr_t handle, ptx):
+    cdef void* _ptx_ = get_buffer_pointer(ptx, -1, readonly=False)
     with nogil:
-        status = nvJitLinkGetLinkedPtx(<Handle>handle, <char*>ptx)
+        status = nvJitLinkGetLinkedPtx(<Handle>handle, <char*>_ptx_)
     check_status(status)
 
 
 cpdef size_t get_error_log_size(intptr_t handle) except? 0:
-    """nvJitLinkGetErrorLogSize gets the size of the error log.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-
-    Returns:
-        size_t: Size of the error log.
-
-    .. seealso:: `nvJitLinkGetErrorLogSize`
-    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetErrorLogSize(<Handle>handle, &size)
@@ -247,31 +159,14 @@ cpdef size_t get_error_log_size(intptr_t handle) except? 0:
     return size
 
 
-cpdef get_error_log(intptr_t handle, intptr_t log):
-    """nvJitLinkGetErrorLog puts any error messages in the log.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        log (intptr_t): The error log.
-
-    .. seealso:: `nvJitLinkGetErrorLog`
-    """
+cpdef get_error_log(intptr_t handle, log):
+    cdef void* _log_ = get_buffer_pointer(log, -1, readonly=False)
     with nogil:
-        status = nvJitLinkGetErrorLog(<Handle>handle, <char*>log)
+        status = nvJitLinkGetErrorLog(<Handle>handle, <char*>_log_)
     check_status(status)
 
 
 cpdef size_t get_info_log_size(intptr_t handle) except? 0:
-    """nvJitLinkGetInfoLogSize gets the size of the info log.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-
-    Returns:
-        size_t: Size of the info log.
-
-    .. seealso:: `nvJitLinkGetInfoLogSize`
-    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetInfoLogSize(<Handle>handle, &size)
@@ -279,31 +174,14 @@ cpdef size_t get_info_log_size(intptr_t handle) except? 0:
     return size
 
 
-cpdef get_info_log(intptr_t handle, intptr_t log):
-    """nvJitLinkGetInfoLog puts any info messages in the log.
-
-    Args:
-        handle (intptr_t): nvJitLink handle.
-        log (intptr_t): The info log.
-
-    .. seealso:: `nvJitLinkGetInfoLog`
-    """
+cpdef get_info_log(intptr_t handle, log):
+    cdef void* _log_ = get_buffer_pointer(log, -1, readonly=False)
     with nogil:
-        status = nvJitLinkGetInfoLog(<Handle>handle, <char*>log)
+        status = nvJitLinkGetInfoLog(<Handle>handle, <char*>_log_)
     check_status(status)
 
 
 cpdef tuple version():
-    """nvJitLinkVersion returns the current version of nvJitLink.
-
-    Returns:
-        A 2-tuple containing:
-
-        - unsigned int: The major version.
-        - unsigned int: The minor version.
-
-    .. seealso:: `nvJitLinkVersion`
-    """
     cdef unsigned int major
     cdef unsigned int minor
     with nogil:

From 992ddcf6d543af37c1798bd18f32f8687451f4dc Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 22 Oct 2024 10:26:38 -0700
Subject: [PATCH 27/34] regenerate bindings

---
 cuda_bindings/cuda/bindings/nvjitlink.pxd | 2 +-
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 59b56d2a3..4f701ed4d 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -29,7 +29,7 @@ ctypedef nvJitLinkInputType _InputType
 ###############################################################################
 
 cpdef intptr_t create(uint32_t num_options, options) except -1
-cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name)
+cpdef add_data(intptr_t handle, int input_type, data, size_t size, name)
 cpdef add_file(intptr_t handle, int input_type, file_name)
 cpdef complete(intptr_t handle)
 cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 8237ea14c..7ffb16d9a 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -95,13 +95,14 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
     return <intptr_t>handle
 
 
-cpdef add_data(intptr_t handle, int input_type, intptr_t data, size_t size, name):
+cpdef add_data(intptr_t handle, int input_type, data, size_t size, name):
+    cdef void* _data_ = get_buffer_pointer(data, size, readonly=True)
     if not isinstance(name, str):
         raise TypeError("name must be a Python str")
     cdef bytes _temp_name_ = (<str>name).encode()
     cdef char* _name_ = _temp_name_
     with nogil:
-        status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>data, size, <const char*>_name_)
+        status = nvJitLinkAddData(<Handle>handle, <_InputType>input_type, <const void*>_data_, size, <const char*>_name_)
     check_status(status)
 
 

From b5c5c1c5b769fa60cb34980f68db939dc08cd90a Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 22 Oct 2024 10:54:18 -0700
Subject: [PATCH 28/34] regenerate with docstrings

---
 cuda_bindings/cuda/bindings/nvjitlink.pyx | 126 ++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 7ffb16d9a..01a12528e 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -86,6 +86,23 @@ cpdef destroy(intptr_t handle):
 
 
 cpdef intptr_t create(uint32_t num_options, options) except -1:
+    """nvJitLinkCreate creates an instance of nvJitLinkHandle with the given input options, and sets the output parameter ``handle``.
+
+    Args:
+        num_options (uint32_t): Number of options passed.
+        options (object): Array of size ``num_options`` of option strings. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
+
+
+    Returns:
+        intptr_t: Address of nvJitLink handle.
+
+    .. seealso:: `nvJitLinkCreate`
+    """
     cdef nested_resource[ char ] _options_
     get_nested_resource_ptr[char](_options_, options, <char*>NULL)
     cdef Handle handle
@@ -96,6 +113,17 @@ cpdef intptr_t create(uint32_t num_options, options) except -1:
 
 
 cpdef add_data(intptr_t handle, int input_type, data, size_t size, name):
+    """nvJitLinkAddData adds data image to the link.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        input_type (InputType): kind of input.
+        data (bytes): pointer to data image in memory.
+        size (size_t): size of the data.
+        name (str): name of input object.
+
+    .. seealso:: `nvJitLinkAddData`
+    """
     cdef void* _data_ = get_buffer_pointer(data, size, readonly=True)
     if not isinstance(name, str):
         raise TypeError("name must be a Python str")
@@ -107,6 +135,15 @@ cpdef add_data(intptr_t handle, int input_type, data, size_t size, name):
 
 
 cpdef add_file(intptr_t handle, int input_type, file_name):
+    """nvJitLinkAddFile reads data from file and links it in.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        input_type (InputType): kind of input.
+        file_name (str): name of file.
+
+    .. seealso:: `nvJitLinkAddFile`
+    """
     if not isinstance(file_name, str):
         raise TypeError("file_name must be a Python str")
     cdef bytes _temp_file_name_ = (<str>file_name).encode()
@@ -117,12 +154,29 @@ cpdef add_file(intptr_t handle, int input_type, file_name):
 
 
 cpdef complete(intptr_t handle):
+    """nvJitLinkComplete does the actual link.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    .. seealso:: `nvJitLinkComplete`
+    """
     with nogil:
         status = nvJitLinkComplete(<Handle>handle)
     check_status(status)
 
 
 cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0:
+    """nvJitLinkGetLinkedCubinSize gets the size of the linked cubin.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    Returns:
+        size_t: Size of the linked cubin.
+
+    .. seealso:: `nvJitLinkGetLinkedCubinSize`
+    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetLinkedCubinSize(<Handle>handle, &size)
@@ -131,6 +185,14 @@ cpdef size_t get_linked_cubin_size(intptr_t handle) except? 0:
 
 
 cpdef get_linked_cubin(intptr_t handle, cubin):
+    """nvJitLinkGetLinkedCubin gets the linked cubin.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        cubin (object): Writable buffer (e.g. ``bytearray``) that receives the linked cubin; immutable ``bytes`` is not writable.
+
+    .. seealso:: `nvJitLinkGetLinkedCubin`
+    """
     cdef void* _cubin_ = get_buffer_pointer(cubin, -1, readonly=False)
     with nogil:
         status = nvJitLinkGetLinkedCubin(<Handle>handle, <void*>_cubin_)
@@ -138,6 +200,16 @@ cpdef get_linked_cubin(intptr_t handle, cubin):
 
 
 cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0:
+    """nvJitLinkGetLinkedPtxSize gets the size of the linked ptx.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    Returns:
+        size_t: Size of the linked PTX.
+
+    .. seealso:: `nvJitLinkGetLinkedPtxSize`
+    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetLinkedPtxSize(<Handle>handle, &size)
@@ -146,6 +218,14 @@ cpdef size_t get_linked_ptx_size(intptr_t handle) except? 0:
 
 
 cpdef get_linked_ptx(intptr_t handle, ptx):
+    """nvJitLinkGetLinkedPtx gets the linked ptx.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        ptx (object): Writable buffer (e.g. ``bytearray``) that receives the linked PTX; immutable ``bytes`` is not writable.
+
+    .. seealso:: `nvJitLinkGetLinkedPtx`
+    """
     cdef void* _ptx_ = get_buffer_pointer(ptx, -1, readonly=False)
     with nogil:
         status = nvJitLinkGetLinkedPtx(<Handle>handle, <char*>_ptx_)
@@ -153,6 +233,16 @@ cpdef get_linked_ptx(intptr_t handle, ptx):
 
 
 cpdef size_t get_error_log_size(intptr_t handle) except? 0:
+    """nvJitLinkGetErrorLogSize gets the size of the error log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    Returns:
+        size_t: Size of the error log.
+
+    .. seealso:: `nvJitLinkGetErrorLogSize`
+    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetErrorLogSize(<Handle>handle, &size)
@@ -161,6 +251,14 @@ cpdef size_t get_error_log_size(intptr_t handle) except? 0:
 
 
 cpdef get_error_log(intptr_t handle, log):
+    """nvJitLinkGetErrorLog puts any error messages in the log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        log (object): Writable buffer (e.g. ``bytearray``) that receives the error log; immutable ``bytes`` is not writable.
+
+    .. seealso:: `nvJitLinkGetErrorLog`
+    """
     cdef void* _log_ = get_buffer_pointer(log, -1, readonly=False)
     with nogil:
         status = nvJitLinkGetErrorLog(<Handle>handle, <char*>_log_)
@@ -168,6 +266,16 @@ cpdef get_error_log(intptr_t handle, log):
 
 
 cpdef size_t get_info_log_size(intptr_t handle) except? 0:
+    """nvJitLinkGetInfoLogSize gets the size of the info log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+
+    Returns:
+        size_t: Size of the info log.
+
+    .. seealso:: `nvJitLinkGetInfoLogSize`
+    """
     cdef size_t size
     with nogil:
         status = nvJitLinkGetInfoLogSize(<Handle>handle, &size)
@@ -176,6 +284,14 @@ cpdef size_t get_info_log_size(intptr_t handle) except? 0:
 
 
 cpdef get_info_log(intptr_t handle, log):
+    """nvJitLinkGetInfoLog puts any info messages in the log.
+
+    Args:
+        handle (intptr_t): nvJitLink handle.
+        log (object): Writable buffer (e.g. ``bytearray``) that receives the info log; immutable ``bytes`` is not writable.
+
+    .. seealso:: `nvJitLinkGetInfoLog`
+    """
     cdef void* _log_ = get_buffer_pointer(log, -1, readonly=False)
     with nogil:
         status = nvJitLinkGetInfoLog(<Handle>handle, <char*>_log_)
@@ -183,6 +299,16 @@ cpdef get_info_log(intptr_t handle, log):
 
 
 cpdef tuple version():
+    """nvJitLinkVersion returns the current version of nvJitLink.
+
+    Returns:
+        A 2-tuple containing:
+
+        - unsigned int: The major version.
+        - unsigned int: The minor version.
+
+    .. seealso:: `nvJitLinkVersion`
+    """
     cdef unsigned int major
     cdef unsigned int minor
     with nogil:

From 8ee6aa2c16561312de13dfe91e7b6cfd259a1f0e Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 22 Oct 2024 11:17:06 -0700
Subject: [PATCH 29/34] regenerate bindings with windows imports

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index e50de88af..586296ab4 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -8,6 +8,7 @@ from libc.stdint cimport intptr_t
 
 from .utils cimport get_nvjitlink_dso_version_suffix
 
+from .utils import FunctionNotFoundError, NotSupportedError
 
 import os
 import site

From f276cd67256f126ed239f5d64cc64ec549cafa47 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 23 Oct 2024 01:20:15 +0000
Subject: [PATCH 30/34] use tmp_path fixture

---
 cuda_bindings/tests/test_nvjitlink.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index 6524c4a88..a08377989 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -7,7 +7,7 @@
 # is strictly prohibited.
 
 import pytest
-import os
+
 from cuda.bindings import nvjitlink
 
 
@@ -76,16 +76,13 @@ def test_add_data():
     nvjitlink.destroy(handle)
 
 
-def test_add_file():
+def test_add_file(tmp_path):
     handle = nvjitlink.create(1, ["-arch=sm_90"])
-    file_path = "test_file.cubin"
-    with open (file_path, "wb") as f:
-        f.write(ptx_kernel_bytes)
-
+    file_path = tmp_path / "test_file.cubin"
+    file_path.write_bytes(ptx_kernel_bytes)
     nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
     nvjitlink.complete(handle)
     nvjitlink.destroy(handle)
-    os.remove(file_path)
 
 
 def test_get_error_log():

From fcdae67288d82a25aa73e4e7046c1e75eafb16b2 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 22 Oct 2024 21:24:07 -0400
Subject: [PATCH 31/34] fix license header in the test file

---
 cuda_bindings/tests/test_nvjitlink.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index a08377989..182d2bc40 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -1,10 +1,6 @@
-# Copyright 2021-2024 NVIDIA Corporation.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import pytest
 

From f861f80c23245b997204476b7de8ac6201693e86 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 22 Oct 2024 18:54:42 -0700
Subject: [PATCH 32/34] fix nvjitlink dll name

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 586296ab4..b8ab705d8 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -52,7 +52,7 @@ cdef load_library(const int driver_ver):
     for suffix in get_nvjitlink_dso_version_suffix(driver_ver):
         if len(suffix) == 0:
             continue
-        dll_name = f"nvJitLink64_{suffix}.dll"
+        dll_name = f"nvJitLink_{suffix}0_0.dll"
 
         # First check if the DLL has been loaded by 3rd parties
         try:

From 375c33b30a4253bebe6cee2048f691029797cf55 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Fri, 25 Oct 2024 09:12:06 -0700
Subject: [PATCH 33/34] fix typo in setup.py

---
 cuda_bindings/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 9d0f6fad3..316559859 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -272,7 +272,7 @@ def do_cythonize(extensions):
     # tests
     ["tests/*.pyx"],
 
-    # interal files used by cybind. We on
+    # interal files used by cybind generated bindings
     ['cuda/bindings/_internal/nvjitlink.pyx'],
     ['cuda/bindings/_internal/utils.pyx'],
 ]

From f7704a93d7fe606fe6d9ff15a7ae9d8d0dcc4271 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 25 Oct 2024 12:19:15 -0400
Subject: [PATCH 34/34] Update cuda_bindings/setup.py

---
 cuda_bindings/setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 316559859..ca1f82648 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -271,8 +271,7 @@ def do_cythonize(extensions):
     ["cuda/*.pyx"],
     # tests
     ["tests/*.pyx"],
-
-    # interal files used by cybind generated bindings
+    # internal files used by generated bindings
     ['cuda/bindings/_internal/nvjitlink.pyx'],
     ['cuda/bindings/_internal/utils.pyx'],
 ]