diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index baab50a81..43a018880 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -17,7 +17,7 @@ inputs:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
     type: string
-    default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink"
+    default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
 
 runs:
   using: composite
@@ -25,10 +25,29 @@ runs:
     - name: Set up CTK cache variable
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
-        HASH=$(echo -n "${{ inputs.cuda-components }}" | sha256sum | awk '{print $1}')
+        # Pre-process the component list to ensure hash uniqueness
+        CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }}
+        # Conditionally strip out libnvjitlink for CUDA versions < 12
+        CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
+        if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then
+          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}"
+        fi
+        # Conditionally strip out libcufile since it does not support Windows
+        if [[ "${{ inputs.host-platform }}" == win-* ]]; then
+          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}"
+        fi
+        # Conditionally strip out libcufile for CUDA versions < 12.2.0 + aarch64 (redist not available)
+        CUDA_MINOR_VER="$(cut -d '.' -f 2 <<< ${{ inputs.cuda-version }})"
+        if [[ ("$CUDA_MAJOR_VER" -lt 12 || "$CUDA_MINOR_VER" -lt 2) && "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
+          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}"
+        fi
+        # Cleanup stray commas after removing components
+        CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
+
+        HASH=$(echo -n "${CTK_CACHE_COMPONENTS}" | sha256sum | awk '{print $1}')
         echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH" >> $GITHUB_ENV
         echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH.tar.gz" >> $GITHUB_ENV
-        echo "CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }}" >> $GITHUB_ENV
+        echo "CTK_CACHE_COMPONENTS=${CTK_CACHE_COMPONENTS}" >> $GITHUB_ENV
 
     - name: Install dependencies
       uses: ./.github/actions/install_unix_deps
@@ -94,12 +113,6 @@ runs:
           rm $CTK_COMPONENT_COMPONENT_FILENAME
         }
 
-        # Conditionally strip out libnvjitlink for CUDA versions < 12
-        if [[ "$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})" -lt 12 ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}"
-        fi
-        # Cleanup stray commas after removing components
-        CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
         # Get headers and shared libraries in place
         for item in $(echo $CTK_CACHE_COMPONENTS | tr ',' ' '); do
             populate_cuda_path "$item"
diff --git a/cuda_bindings/cuda/bindings/_internal/cufile.pxd b/cuda_bindings/cuda/bindings/_internal/cufile.pxd
new file mode 100644
index 000000000..9cccb9fee
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/cufile.pxd
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+from ..cycufile cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef CUfileError_t _cuFileHandleRegister(CUfileHandle_t* fh, CUfileDescr_t* descr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef void _cuFileHandleDeregister(CUfileHandle_t fh) except* nogil
+cdef CUfileError_t _cuFileBufRegister(const void* bufPtr_base, size_t length, int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileBufDeregister(const void* bufPtr_base) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef ssize_t _cuFileRead(CUfileHandle_t fh, void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil
+cdef ssize_t _cuFileWrite(CUfileHandle_t fh, const void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil
+cdef CUfileError_t _cuFileDriverOpen() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileDriverClose_v2() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef long _cuFileUseCount() except* nogil
+cdef CUfileError_t _cuFileDriverGetProperties(CUfileDrvProps_t* props) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileDriverSetPollMode(cpp_bool poll, size_t poll_threshold_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileDriverSetMaxDirectIOSize(size_t max_direct_io_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileDriverSetMaxCacheSize(size_t max_cache_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileDriverSetMaxPinnedMemSize(size_t max_pinned_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileBatchIOSetUp(CUfileBatchHandle_t* batch_idp, unsigned nr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileBatchIOSubmit(CUfileBatchHandle_t batch_idp, unsigned nr, CUfileIOParams_t* iocbp, unsigned int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileBatchIOGetStatus(CUfileBatchHandle_t batch_idp, unsigned min_nr, unsigned* nr, CUfileIOEvents_t* iocbp, timespec* timeout) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileBatchIOCancel(CUfileBatchHandle_t batch_idp) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef void _cuFileBatchIODestroy(CUfileBatchHandle_t batch_idp) except* nogil
+cdef CUfileError_t _cuFileReadAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_read_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileWriteAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_written_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileStreamRegister(CUstream stream, unsigned flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileStreamDeregister(CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileGetVersion(int* version) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileGetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileGetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileGetParameterString(CUFileStringConfigParameter_t param, char* desc_str, int len) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileSetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileSetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t _cuFileSetParameterString(CUFileStringConfigParameter_t param, const char* desc_str) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx
new file mode 100644
index 000000000..66cb24ea7
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx
@@ -0,0 +1,734 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uintptr_t
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.bindings import path_finder
+
+import cython
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_cufile_init = False
+cdef void* __cuDriverGetVersion = NULL
+
+cdef void* __cuFileHandleRegister = NULL
+cdef void* __cuFileHandleDeregister = NULL
+cdef void* __cuFileBufRegister = NULL
+cdef void* __cuFileBufDeregister = NULL
+cdef void* __cuFileRead = NULL
+cdef void* __cuFileWrite = NULL
+cdef void* __cuFileDriverOpen = NULL
+cdef void* __cuFileDriverClose_v2 = NULL
+cdef void* __cuFileUseCount = NULL
+cdef void* __cuFileDriverGetProperties = NULL
+cdef void* __cuFileDriverSetPollMode = NULL
+cdef void* __cuFileDriverSetMaxDirectIOSize = NULL
+cdef void* __cuFileDriverSetMaxCacheSize = NULL
+cdef void* __cuFileDriverSetMaxPinnedMemSize = NULL
+cdef void* __cuFileBatchIOSetUp = NULL
+cdef void* __cuFileBatchIOSubmit = NULL
+cdef void* __cuFileBatchIOGetStatus = NULL
+cdef void* __cuFileBatchIOCancel = NULL
+cdef void* __cuFileBatchIODestroy = NULL
+cdef void* __cuFileReadAsync = NULL
+cdef void* __cuFileWriteAsync = NULL
+cdef void* __cuFileStreamRegister = NULL
+cdef void* __cuFileStreamDeregister = NULL
+cdef void* __cuFileGetVersion = NULL
+cdef void* __cuFileGetParameterSizeT = NULL
+cdef void* __cuFileGetParameterBool = NULL
+cdef void* __cuFileGetParameterString = NULL
+cdef void* __cuFileSetParameterSizeT = NULL
+cdef void* __cuFileSetParameterBool = NULL
+cdef void* __cuFileSetParameterString = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:
+    cdef uintptr_t handle = path_finder._load_nvidia_dynamic_library("cufile").handle
+    return <void*>handle
+
+
+cdef int _check_or_init_cufile() except -1 nogil:
+    global __py_cufile_init
+    if __py_cufile_init:
+        return 0
+
+    # Load driver to check version
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    global __cuDriverGetVersion
+    if __cuDriverGetVersion == NULL:
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:
+        with gil:
+            raise RuntimeError('something went wrong')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) noexcept nogil>__cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        with gil:
+            raise RuntimeError('something went wrong')
+    #dlclose(handle)
+    handle = NULL
+
+    # Load function
+    global __cuFileHandleRegister
+    __cuFileHandleRegister = dlsym(RTLD_DEFAULT, 'cuFileHandleRegister')
+    if __cuFileHandleRegister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileHandleRegister = dlsym(handle, 'cuFileHandleRegister')
+
+    global __cuFileHandleDeregister
+    __cuFileHandleDeregister = dlsym(RTLD_DEFAULT, 'cuFileHandleDeregister')
+    if __cuFileHandleDeregister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileHandleDeregister = dlsym(handle, 'cuFileHandleDeregister')
+
+    global __cuFileBufRegister
+    __cuFileBufRegister = dlsym(RTLD_DEFAULT, 'cuFileBufRegister')
+    if __cuFileBufRegister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBufRegister = dlsym(handle, 'cuFileBufRegister')
+
+    global __cuFileBufDeregister
+    __cuFileBufDeregister = dlsym(RTLD_DEFAULT, 'cuFileBufDeregister')
+    if __cuFileBufDeregister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBufDeregister = dlsym(handle, 'cuFileBufDeregister')
+
+    global __cuFileRead
+    __cuFileRead = dlsym(RTLD_DEFAULT, 'cuFileRead')
+    if __cuFileRead == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileRead = dlsym(handle, 'cuFileRead')
+
+    global __cuFileWrite
+    __cuFileWrite = dlsym(RTLD_DEFAULT, 'cuFileWrite')
+    if __cuFileWrite == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileWrite = dlsym(handle, 'cuFileWrite')
+
+    global __cuFileDriverOpen
+    __cuFileDriverOpen = dlsym(RTLD_DEFAULT, 'cuFileDriverOpen')
+    if __cuFileDriverOpen == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverOpen = dlsym(handle, 'cuFileDriverOpen')
+
+    global __cuFileDriverClose_v2
+    __cuFileDriverClose_v2 = dlsym(RTLD_DEFAULT, 'cuFileDriverClose_v2')
+    if __cuFileDriverClose_v2 == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverClose_v2 = dlsym(handle, 'cuFileDriverClose_v2')
+
+    global __cuFileUseCount
+    __cuFileUseCount = dlsym(RTLD_DEFAULT, 'cuFileUseCount')
+    if __cuFileUseCount == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileUseCount = dlsym(handle, 'cuFileUseCount')
+
+    global __cuFileDriverGetProperties
+    __cuFileDriverGetProperties = dlsym(RTLD_DEFAULT, 'cuFileDriverGetProperties')
+    if __cuFileDriverGetProperties == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverGetProperties = dlsym(handle, 'cuFileDriverGetProperties')
+
+    global __cuFileDriverSetPollMode
+    __cuFileDriverSetPollMode = dlsym(RTLD_DEFAULT, 'cuFileDriverSetPollMode')
+    if __cuFileDriverSetPollMode == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverSetPollMode = dlsym(handle, 'cuFileDriverSetPollMode')
+
+    global __cuFileDriverSetMaxDirectIOSize
+    __cuFileDriverSetMaxDirectIOSize = dlsym(RTLD_DEFAULT, 'cuFileDriverSetMaxDirectIOSize')
+    if __cuFileDriverSetMaxDirectIOSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverSetMaxDirectIOSize = dlsym(handle, 'cuFileDriverSetMaxDirectIOSize')
+
+    global __cuFileDriverSetMaxCacheSize
+    __cuFileDriverSetMaxCacheSize = dlsym(RTLD_DEFAULT, 'cuFileDriverSetMaxCacheSize')
+    if __cuFileDriverSetMaxCacheSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverSetMaxCacheSize = dlsym(handle, 'cuFileDriverSetMaxCacheSize')
+
+    global __cuFileDriverSetMaxPinnedMemSize
+    __cuFileDriverSetMaxPinnedMemSize = dlsym(RTLD_DEFAULT, 'cuFileDriverSetMaxPinnedMemSize')
+    if __cuFileDriverSetMaxPinnedMemSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileDriverSetMaxPinnedMemSize = dlsym(handle, 'cuFileDriverSetMaxPinnedMemSize')
+
+    global __cuFileBatchIOSetUp
+    __cuFileBatchIOSetUp = dlsym(RTLD_DEFAULT, 'cuFileBatchIOSetUp')
+    if __cuFileBatchIOSetUp == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBatchIOSetUp = dlsym(handle, 'cuFileBatchIOSetUp')
+
+    global __cuFileBatchIOSubmit
+    __cuFileBatchIOSubmit = dlsym(RTLD_DEFAULT, 'cuFileBatchIOSubmit')
+    if __cuFileBatchIOSubmit == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBatchIOSubmit = dlsym(handle, 'cuFileBatchIOSubmit')
+
+    global __cuFileBatchIOGetStatus
+    __cuFileBatchIOGetStatus = dlsym(RTLD_DEFAULT, 'cuFileBatchIOGetStatus')
+    if __cuFileBatchIOGetStatus == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBatchIOGetStatus = dlsym(handle, 'cuFileBatchIOGetStatus')
+
+    global __cuFileBatchIOCancel
+    __cuFileBatchIOCancel = dlsym(RTLD_DEFAULT, 'cuFileBatchIOCancel')
+    if __cuFileBatchIOCancel == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBatchIOCancel = dlsym(handle, 'cuFileBatchIOCancel')
+
+    global __cuFileBatchIODestroy
+    __cuFileBatchIODestroy = dlsym(RTLD_DEFAULT, 'cuFileBatchIODestroy')
+    if __cuFileBatchIODestroy == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileBatchIODestroy = dlsym(handle, 'cuFileBatchIODestroy')
+
+    global __cuFileReadAsync
+    __cuFileReadAsync = dlsym(RTLD_DEFAULT, 'cuFileReadAsync')
+    if __cuFileReadAsync == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileReadAsync = dlsym(handle, 'cuFileReadAsync')
+
+    global __cuFileWriteAsync
+    __cuFileWriteAsync = dlsym(RTLD_DEFAULT, 'cuFileWriteAsync')
+    if __cuFileWriteAsync == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileWriteAsync = dlsym(handle, 'cuFileWriteAsync')
+
+    global __cuFileStreamRegister
+    __cuFileStreamRegister = dlsym(RTLD_DEFAULT, 'cuFileStreamRegister')
+    if __cuFileStreamRegister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileStreamRegister = dlsym(handle, 'cuFileStreamRegister')
+
+    global __cuFileStreamDeregister
+    __cuFileStreamDeregister = dlsym(RTLD_DEFAULT, 'cuFileStreamDeregister')
+    if __cuFileStreamDeregister == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileStreamDeregister = dlsym(handle, 'cuFileStreamDeregister')
+
+    global __cuFileGetVersion
+    __cuFileGetVersion = dlsym(RTLD_DEFAULT, 'cuFileGetVersion')
+    if __cuFileGetVersion == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileGetVersion = dlsym(handle, 'cuFileGetVersion')
+
+    global __cuFileGetParameterSizeT
+    __cuFileGetParameterSizeT = dlsym(RTLD_DEFAULT, 'cuFileGetParameterSizeT')
+    if __cuFileGetParameterSizeT == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileGetParameterSizeT = dlsym(handle, 'cuFileGetParameterSizeT')
+
+    global __cuFileGetParameterBool
+    __cuFileGetParameterBool = dlsym(RTLD_DEFAULT, 'cuFileGetParameterBool')
+    if __cuFileGetParameterBool == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileGetParameterBool = dlsym(handle, 'cuFileGetParameterBool')
+
+    global __cuFileGetParameterString
+    __cuFileGetParameterString = dlsym(RTLD_DEFAULT, 'cuFileGetParameterString')
+    if __cuFileGetParameterString == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileGetParameterString = dlsym(handle, 'cuFileGetParameterString')
+
+    global __cuFileSetParameterSizeT
+    __cuFileSetParameterSizeT = dlsym(RTLD_DEFAULT, 'cuFileSetParameterSizeT')
+    if __cuFileSetParameterSizeT == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileSetParameterSizeT = dlsym(handle, 'cuFileSetParameterSizeT')
+
+    global __cuFileSetParameterBool
+    __cuFileSetParameterBool = dlsym(RTLD_DEFAULT, 'cuFileSetParameterBool')
+    if __cuFileSetParameterBool == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileSetParameterBool = dlsym(handle, 'cuFileSetParameterBool')
+
+    global __cuFileSetParameterString
+    __cuFileSetParameterString = dlsym(RTLD_DEFAULT, 'cuFileSetParameterString')
+    if __cuFileSetParameterString == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __cuFileSetParameterString = dlsym(handle, 'cuFileSetParameterString')
+
+    __py_cufile_init = True
+    return 0
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs
+
+    _check_or_init_cufile()
+    cdef dict data = {}
+
+    global __cuFileHandleRegister
+    data["__cuFileHandleRegister"] = <intptr_t>__cuFileHandleRegister
+
+    global __cuFileHandleDeregister
+    data["__cuFileHandleDeregister"] = <intptr_t>__cuFileHandleDeregister
+
+    global __cuFileBufRegister
+    data["__cuFileBufRegister"] = <intptr_t>__cuFileBufRegister
+
+    global __cuFileBufDeregister
+    data["__cuFileBufDeregister"] = <intptr_t>__cuFileBufDeregister
+
+    global __cuFileRead
+    data["__cuFileRead"] = <intptr_t>__cuFileRead
+
+    global __cuFileWrite
+    data["__cuFileWrite"] = <intptr_t>__cuFileWrite
+
+    global __cuFileDriverOpen
+    data["__cuFileDriverOpen"] = <intptr_t>__cuFileDriverOpen
+
+    global __cuFileDriverClose_v2
+    data["__cuFileDriverClose_v2"] = <intptr_t>__cuFileDriverClose_v2
+
+    global __cuFileUseCount
+    data["__cuFileUseCount"] = <intptr_t>__cuFileUseCount
+
+    global __cuFileDriverGetProperties
+    data["__cuFileDriverGetProperties"] = <intptr_t>__cuFileDriverGetProperties
+
+    global __cuFileDriverSetPollMode
+    data["__cuFileDriverSetPollMode"] = <intptr_t>__cuFileDriverSetPollMode
+
+    global __cuFileDriverSetMaxDirectIOSize
+    data["__cuFileDriverSetMaxDirectIOSize"] = <intptr_t>__cuFileDriverSetMaxDirectIOSize
+
+    global __cuFileDriverSetMaxCacheSize
+    data["__cuFileDriverSetMaxCacheSize"] = <intptr_t>__cuFileDriverSetMaxCacheSize
+
+    global __cuFileDriverSetMaxPinnedMemSize
+    data["__cuFileDriverSetMaxPinnedMemSize"] = <intptr_t>__cuFileDriverSetMaxPinnedMemSize
+
+    global __cuFileBatchIOSetUp
+    data["__cuFileBatchIOSetUp"] = <intptr_t>__cuFileBatchIOSetUp
+
+    global __cuFileBatchIOSubmit
+    data["__cuFileBatchIOSubmit"] = <intptr_t>__cuFileBatchIOSubmit
+
+    global __cuFileBatchIOGetStatus
+    data["__cuFileBatchIOGetStatus"] = <intptr_t>__cuFileBatchIOGetStatus
+
+    global __cuFileBatchIOCancel
+    data["__cuFileBatchIOCancel"] = <intptr_t>__cuFileBatchIOCancel
+
+    global __cuFileBatchIODestroy
+    data["__cuFileBatchIODestroy"] = <intptr_t>__cuFileBatchIODestroy
+
+    global __cuFileReadAsync
+    data["__cuFileReadAsync"] = <intptr_t>__cuFileReadAsync
+
+    global __cuFileWriteAsync
+    data["__cuFileWriteAsync"] = <intptr_t>__cuFileWriteAsync
+
+    global __cuFileStreamRegister
+    data["__cuFileStreamRegister"] = <intptr_t>__cuFileStreamRegister
+
+    global __cuFileStreamDeregister
+    data["__cuFileStreamDeregister"] = <intptr_t>__cuFileStreamDeregister
+
+    global __cuFileGetVersion
+    data["__cuFileGetVersion"] = <intptr_t>__cuFileGetVersion
+
+    global __cuFileGetParameterSizeT
+    data["__cuFileGetParameterSizeT"] = <intptr_t>__cuFileGetParameterSizeT
+
+    global __cuFileGetParameterBool
+    data["__cuFileGetParameterBool"] = <intptr_t>__cuFileGetParameterBool
+
+    global __cuFileGetParameterString
+    data["__cuFileGetParameterString"] = <intptr_t>__cuFileGetParameterString
+
+    global __cuFileSetParameterSizeT
+    data["__cuFileSetParameterSizeT"] = <intptr_t>__cuFileSetParameterSizeT
+
+    global __cuFileSetParameterBool
+    data["__cuFileSetParameterBool"] = <intptr_t>__cuFileSetParameterBool
+
+    global __cuFileSetParameterString
+    data["__cuFileSetParameterString"] = <intptr_t>__cuFileSetParameterString
+
+    func_ptrs = data
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    global func_ptrs
+    if func_ptrs is None:
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef CUfileError_t _cuFileHandleRegister(CUfileHandle_t* fh, CUfileDescr_t* descr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileHandleRegister
+    _check_or_init_cufile()
+    if __cuFileHandleRegister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileHandleRegister is not found")
+    return (<CUfileError_t (*)(CUfileHandle_t*, CUfileDescr_t*) noexcept nogil>__cuFileHandleRegister)(
+        fh, descr)
+
+
+@cython.show_performance_hints(False)
+cdef void _cuFileHandleDeregister(CUfileHandle_t fh) except* nogil:
+    global __cuFileHandleDeregister
+    _check_or_init_cufile()
+    if __cuFileHandleDeregister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileHandleDeregister is not found")
+    (<void (*)(CUfileHandle_t) noexcept nogil>__cuFileHandleDeregister)(
+        fh)
+
+
+cdef CUfileError_t _cuFileBufRegister(const void* bufPtr_base, size_t length, int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBufRegister
+    _check_or_init_cufile()
+    if __cuFileBufRegister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBufRegister is not found")
+    return (<CUfileError_t (*)(const void*, size_t, int) noexcept nogil>__cuFileBufRegister)(
+        bufPtr_base, length, flags)
+
+
+cdef CUfileError_t _cuFileBufDeregister(const void* bufPtr_base) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBufDeregister
+    _check_or_init_cufile()
+    if __cuFileBufDeregister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBufDeregister is not found")
+    return (<CUfileError_t (*)(const void*) noexcept nogil>__cuFileBufDeregister)(
+        bufPtr_base)
+
+
+cdef ssize_t _cuFileRead(CUfileHandle_t fh, void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil:
+    global __cuFileRead
+    _check_or_init_cufile()
+    if __cuFileRead == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileRead is not found")
+    return (<ssize_t (*)(CUfileHandle_t, void*, size_t, off_t, off_t) noexcept nogil>__cuFileRead)(
+        fh, bufPtr_base, size, file_offset, bufPtr_offset)
+
+
+cdef ssize_t _cuFileWrite(CUfileHandle_t fh, const void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil:
+    global __cuFileWrite
+    _check_or_init_cufile()
+    if __cuFileWrite == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileWrite is not found")
+    return (<ssize_t (*)(CUfileHandle_t, const void*, size_t, off_t, off_t) noexcept nogil>__cuFileWrite)(
+        fh, bufPtr_base, size, file_offset, bufPtr_offset)
+
+
+cdef CUfileError_t _cuFileDriverOpen() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverOpen
+    _check_or_init_cufile()
+    if __cuFileDriverOpen == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverOpen is not found")
+    return (<CUfileError_t (*)() noexcept nogil>__cuFileDriverOpen)(
+        )
+
+
+cdef CUfileError_t _cuFileDriverClose_v2() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverClose_v2
+    _check_or_init_cufile()
+    if __cuFileDriverClose_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverClose_v2 is not found")
+    return (<CUfileError_t (*)() noexcept nogil>__cuFileDriverClose_v2)(
+        )
+
+
+cdef long _cuFileUseCount() except* nogil:
+    global __cuFileUseCount
+    _check_or_init_cufile()
+    if __cuFileUseCount == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileUseCount is not found")
+    return (<long (*)() noexcept nogil>__cuFileUseCount)(
+        )
+
+
+cdef CUfileError_t _cuFileDriverGetProperties(CUfileDrvProps_t* props) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverGetProperties
+    _check_or_init_cufile()
+    if __cuFileDriverGetProperties == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverGetProperties is not found")
+    return (<CUfileError_t (*)(CUfileDrvProps_t*) noexcept nogil>__cuFileDriverGetProperties)(
+        props)
+
+
+cdef CUfileError_t _cuFileDriverSetPollMode(cpp_bool poll, size_t poll_threshold_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverSetPollMode
+    _check_or_init_cufile()
+    if __cuFileDriverSetPollMode == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverSetPollMode is not found")
+    return (<CUfileError_t (*)(cpp_bool, size_t) noexcept nogil>__cuFileDriverSetPollMode)(
+        poll, poll_threshold_size)
+
+
+cdef CUfileError_t _cuFileDriverSetMaxDirectIOSize(size_t max_direct_io_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverSetMaxDirectIOSize
+    _check_or_init_cufile()
+    if __cuFileDriverSetMaxDirectIOSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverSetMaxDirectIOSize is not found")
+    return (<CUfileError_t (*)(size_t) noexcept nogil>__cuFileDriverSetMaxDirectIOSize)(
+        max_direct_io_size)
+
+
+cdef CUfileError_t _cuFileDriverSetMaxCacheSize(size_t max_cache_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverSetMaxCacheSize
+    _check_or_init_cufile()
+    if __cuFileDriverSetMaxCacheSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverSetMaxCacheSize is not found")
+    return (<CUfileError_t (*)(size_t) noexcept nogil>__cuFileDriverSetMaxCacheSize)(
+        max_cache_size)
+
+
+cdef CUfileError_t _cuFileDriverSetMaxPinnedMemSize(size_t max_pinned_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileDriverSetMaxPinnedMemSize
+    _check_or_init_cufile()
+    if __cuFileDriverSetMaxPinnedMemSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileDriverSetMaxPinnedMemSize is not found")
+    return (<CUfileError_t (*)(size_t) noexcept nogil>__cuFileDriverSetMaxPinnedMemSize)(
+        max_pinned_size)
+
+
+cdef CUfileError_t _cuFileBatchIOSetUp(CUfileBatchHandle_t* batch_idp, unsigned nr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBatchIOSetUp
+    _check_or_init_cufile()
+    if __cuFileBatchIOSetUp == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBatchIOSetUp is not found")
+    return (<CUfileError_t (*)(CUfileBatchHandle_t*, unsigned) noexcept nogil>__cuFileBatchIOSetUp)(
+        batch_idp, nr)
+
+
+cdef CUfileError_t _cuFileBatchIOSubmit(CUfileBatchHandle_t batch_idp, unsigned nr, CUfileIOParams_t* iocbp, unsigned int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBatchIOSubmit
+    _check_or_init_cufile()
+    if __cuFileBatchIOSubmit == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBatchIOSubmit is not found")
+    return (<CUfileError_t (*)(CUfileBatchHandle_t, unsigned, CUfileIOParams_t*, unsigned int) noexcept nogil>__cuFileBatchIOSubmit)(
+        batch_idp, nr, iocbp, flags)
+
+
+cdef CUfileError_t _cuFileBatchIOGetStatus(CUfileBatchHandle_t batch_idp, unsigned min_nr, unsigned* nr, CUfileIOEvents_t* iocbp, timespec* timeout) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBatchIOGetStatus
+    _check_or_init_cufile()
+    if __cuFileBatchIOGetStatus == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBatchIOGetStatus is not found")
+    return (<CUfileError_t (*)(CUfileBatchHandle_t, unsigned, unsigned*, CUfileIOEvents_t*, timespec*) noexcept nogil>__cuFileBatchIOGetStatus)(
+        batch_idp, min_nr, nr, iocbp, timeout)
+
+
+cdef CUfileError_t _cuFileBatchIOCancel(CUfileBatchHandle_t batch_idp) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileBatchIOCancel
+    _check_or_init_cufile()
+    if __cuFileBatchIOCancel == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBatchIOCancel is not found")
+    return (<CUfileError_t (*)(CUfileBatchHandle_t) noexcept nogil>__cuFileBatchIOCancel)(
+        batch_idp)
+
+
+@cython.show_performance_hints(False)
+cdef void _cuFileBatchIODestroy(CUfileBatchHandle_t batch_idp) except* nogil:
+    global __cuFileBatchIODestroy
+    _check_or_init_cufile()
+    if __cuFileBatchIODestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileBatchIODestroy is not found")
+    (<void (*)(CUfileBatchHandle_t) noexcept nogil>__cuFileBatchIODestroy)(
+        batch_idp)
+
+
+cdef CUfileError_t _cuFileReadAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_read_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileReadAsync
+    _check_or_init_cufile()
+    if __cuFileReadAsync == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileReadAsync is not found")
+    return (<CUfileError_t (*)(CUfileHandle_t, void*, size_t*, off_t*, off_t*, ssize_t*, CUstream) noexcept nogil>__cuFileReadAsync)(
+        fh, bufPtr_base, size_p, file_offset_p, bufPtr_offset_p, bytes_read_p, stream)
+
+
+cdef CUfileError_t _cuFileWriteAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_written_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileWriteAsync
+    _check_or_init_cufile()
+    if __cuFileWriteAsync == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileWriteAsync is not found")
+    return (<CUfileError_t (*)(CUfileHandle_t, void*, size_t*, off_t*, off_t*, ssize_t*, CUstream) noexcept nogil>__cuFileWriteAsync)(
+        fh, bufPtr_base, size_p, file_offset_p, bufPtr_offset_p, bytes_written_p, stream)
+
+
+cdef CUfileError_t _cuFileStreamRegister(CUstream stream, unsigned flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileStreamRegister
+    _check_or_init_cufile()
+    if __cuFileStreamRegister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileStreamRegister is not found")
+    return (<CUfileError_t (*)(CUstream, unsigned) noexcept nogil>__cuFileStreamRegister)(
+        stream, flags)
+
+
+cdef CUfileError_t _cuFileStreamDeregister(CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileStreamDeregister
+    _check_or_init_cufile()
+    if __cuFileStreamDeregister == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileStreamDeregister is not found")
+    return (<CUfileError_t (*)(CUstream) noexcept nogil>__cuFileStreamDeregister)(
+        stream)
+
+
+cdef CUfileError_t _cuFileGetVersion(int* version) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileGetVersion
+    _check_or_init_cufile()
+    if __cuFileGetVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileGetVersion is not found")
+    return (<CUfileError_t (*)(int*) noexcept nogil>__cuFileGetVersion)(
+        version)
+
+
+cdef CUfileError_t _cuFileGetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileGetParameterSizeT
+    _check_or_init_cufile()
+    if __cuFileGetParameterSizeT == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileGetParameterSizeT is not found")
+    return (<CUfileError_t (*)(CUFileSizeTConfigParameter_t, size_t*) noexcept nogil>__cuFileGetParameterSizeT)(
+        param, value)
+
+
+cdef CUfileError_t _cuFileGetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileGetParameterBool
+    _check_or_init_cufile()
+    if __cuFileGetParameterBool == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileGetParameterBool is not found")
+    return (<CUfileError_t (*)(CUFileBoolConfigParameter_t, cpp_bool*) noexcept nogil>__cuFileGetParameterBool)(
+        param, value)
+
+
+cdef CUfileError_t _cuFileGetParameterString(CUFileStringConfigParameter_t param, char* desc_str, int len) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileGetParameterString
+    _check_or_init_cufile()
+    if __cuFileGetParameterString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileGetParameterString is not found")
+    return (<CUfileError_t (*)(CUFileStringConfigParameter_t, char*, int) noexcept nogil>__cuFileGetParameterString)(
+        param, desc_str, len)
+
+
+cdef CUfileError_t _cuFileSetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileSetParameterSizeT
+    _check_or_init_cufile()
+    if __cuFileSetParameterSizeT == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileSetParameterSizeT is not found")
+    return (<CUfileError_t (*)(CUFileSizeTConfigParameter_t, size_t) noexcept nogil>__cuFileSetParameterSizeT)(
+        param, value)
+
+
+cdef CUfileError_t _cuFileSetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileSetParameterBool
+    _check_or_init_cufile()
+    if __cuFileSetParameterBool == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileSetParameterBool is not found")
+    return (<CUfileError_t (*)(CUFileBoolConfigParameter_t, cpp_bool) noexcept nogil>__cuFileSetParameterBool)(
+        param, value)
+
+
+cdef CUfileError_t _cuFileSetParameterString(CUFileStringConfigParameter_t param, const char* desc_str) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    global __cuFileSetParameterString
+    _check_or_init_cufile()
+    if __cuFileSetParameterString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function cuFileSetParameterString is not found")
+    return (<CUfileError_t (*)(CUFileStringConfigParameter_t, const char*) noexcept nogil>__cuFileSetParameterString)(
+        param, desc_str)
diff --git a/cuda_bindings/cuda/bindings/cufile.pxd b/cuda_bindings/cuda/bindings/cufile.pxd
new file mode 100644
index 000000000..69fc6fc67
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cufile.pxd
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .cycufile cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+ctypedef CUfileHandle_t Handle
+ctypedef CUfileBatchHandle_t BatchHandle
+ctypedef CUfileError_t Error
+ctypedef cufileRDMAInfo_t RDMAInfo
+ctypedef CUfileFSOps_t FSOps
+ctypedef CUfileDrvProps_t DrvProps
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+ctypedef CUfileOpError _OpError
+ctypedef CUfileDriverStatusFlags_t _DriverStatusFlags
+ctypedef CUfileDriverControlFlags_t _DriverControlFlags
+ctypedef CUfileFeatureFlags_t _FeatureFlags
+ctypedef CUfileFileHandleType _FileHandleType
+ctypedef CUfileOpcode_t _Opcode
+ctypedef CUfileStatus_t _Status
+ctypedef CUfileBatchMode_t _BatchMode
+ctypedef CUFileSizeTConfigParameter_t _SizeTConfigParameter
+ctypedef CUFileBoolConfigParameter_t _BoolConfigParameter
+ctypedef CUFileStringConfigParameter_t _StringConfigParameter
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef intptr_t handle_register(intptr_t descr) except? 0
+cpdef void handle_deregister(intptr_t fh) except*
+cpdef buf_register(intptr_t buf_ptr_base, size_t length, int flags)
+cpdef buf_deregister(intptr_t buf_ptr_base)
+cpdef read(intptr_t fh, intptr_t buf_ptr_base, size_t size, off_t file_offset, off_t buf_ptr_offset)
+cpdef write(intptr_t fh, intptr_t buf_ptr_base, size_t size, off_t file_offset, off_t buf_ptr_offset)
+cpdef driver_open()
+cpdef use_count()
+cpdef driver_get_properties(intptr_t props)
+cpdef driver_set_poll_mode(bint poll, size_t poll_threshold_size)
+cpdef driver_set_max_direct_io_size(size_t max_direct_io_size)
+cpdef driver_set_max_cache_size(size_t max_cache_size)
+cpdef driver_set_max_pinned_mem_size(size_t max_pinned_size)
+cpdef intptr_t batch_io_set_up(unsigned nr) except? 0
+cpdef batch_io_submit(intptr_t batch_idp, unsigned nr, intptr_t iocbp, unsigned int flags)
+cpdef batch_io_get_status(intptr_t batch_idp, unsigned min_nr, intptr_t nr, intptr_t iocbp, intptr_t timeout)
+cpdef batch_io_cancel(intptr_t batch_idp)
+cpdef void batch_io_destroy(intptr_t batch_idp) except*
+cpdef read_async(intptr_t fh, intptr_t buf_ptr_base, intptr_t size_p, intptr_t file_offset_p, intptr_t buf_ptr_offset_p, intptr_t bytes_read_p, intptr_t stream)
+cpdef write_async(intptr_t fh, intptr_t buf_ptr_base, intptr_t size_p, intptr_t file_offset_p, intptr_t buf_ptr_offset_p, intptr_t bytes_written_p, intptr_t stream)
+cpdef stream_register(intptr_t stream, unsigned flags)
+cpdef stream_deregister(intptr_t stream)
+cpdef int get_version() except? 0
+cpdef size_t get_parameter_size_t(int param) except? 0
+cpdef bint get_parameter_bool(int param) except? 0
+cpdef str get_parameter_string(int param, int len)
+cpdef set_parameter_size_t(int param, size_t value)
+cpdef set_parameter_bool(int param, bint value)
+cpdef set_parameter_string(int param, intptr_t desc_str)
diff --git a/cuda_bindings/cuda/bindings/cufile.pyx b/cuda_bindings/cuda/bindings/cufile.pyx
new file mode 100644
index 000000000..9fe54009e
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cufile.pyx
@@ -0,0 +1,1296 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+cimport cython  # NOQA
+from libc cimport errno
+from ._internal.utils cimport (get_buffer_pointer, get_nested_resource_ptr,
+                               nested_resource)
+import numpy as _numpy
+from cpython cimport buffer as _buffer
+from cpython.memoryview cimport PyMemoryView_FromMemory
+from enum import IntEnum as _IntEnum
+
+import cython
+
+from cuda.bindings.driver import CUresult as pyCUresult
+
+
+###############################################################################
+# POD
+###############################################################################
+
+_py_anon_pod1_dtype = _numpy.dtype((
+    _numpy.dtype((_numpy.void, sizeof((<CUfileDescr_t*>NULL).handle))),
+    {
+        "fd": (_numpy.int32, 0),
+        "handle": (_numpy.intp, 0),
+    }
+    ))
+
+
+cdef class _py_anon_pod1:
+    """Empty-initialize an instance of `_anon_pod1`.
+
+
+    .. seealso:: `_anon_pod1`
+    """
+    cdef:
+        readonly object _data
+
+    def __init__(self):
+        arr = _numpy.empty(1, dtype=_py_anon_pod1_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof((<CUfileDescr_t*>NULL).handle), \
+            f"itemsize {self._data.itemsize} mismatches union size {sizeof((<CUfileDescr_t*>NULL).handle)}"
+
+    def __repr__(self):
+        return f"<{__name__}._py_anon_pod1 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        return self._data.ctypes.data
+
+    def __eq__(self, other):
+        if not isinstance(other, _py_anon_pod1):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def fd(self):
+        """int: """
+        return int(self._data.fd[0])
+
+    @fd.setter
+    def fd(self, val):
+        self._data.fd = val
+
+    @property
+    def handle(self):
+        """int: """
+        return int(self._data.handle[0])
+
+    @handle.setter
+    def handle(self, val):
+        self._data.handle = val
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an _py_anon_pod1 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `_py_anon_pod1_dtype` holding the data.
+        """
+        cdef _py_anon_pod1 obj = _py_anon_pod1.__new__(_py_anon_pod1)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != _py_anon_pod1_dtype:
+            raise ValueError("data array must be of dtype _py_anon_pod1_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False):
+        """Create an _py_anon_pod1 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef _py_anon_pod1 obj = _py_anon_pod1.__new__(_py_anon_pod1)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof((<CUfileDescr_t*>NULL).handle), flag)
+        data = _numpy.ndarray((1,), buffer=buf,
+                              dtype=_py_anon_pod1_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+_py_anon_pod3_dtype = _numpy.dtype([
+    ("dev_ptr_base", _numpy.intp, ),
+    ("file_offset", _numpy.int64, ),
+    ("dev_ptr_offset", _numpy.int64, ),
+    ("size_", _numpy.uint64, ),
+    ], align=True)
+
+
+cdef class _py_anon_pod3:
+    """Empty-initialize an instance of `_anon_pod3`.
+
+
+    .. seealso:: `_anon_pod3`
+    """
+    cdef:
+        readonly object _data
+
+    def __init__(self):
+        arr = _numpy.empty(1, dtype=_py_anon_pod3_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof((<CUfileIOParams_t*>NULL).u.batch), \
+            f"itemsize {self._data.itemsize} mismatches struct size {sizeof((<CUfileIOParams_t*>NULL).u.batch)}"
+
+    def __repr__(self):
+        return f"<{__name__}._py_anon_pod3 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        return self._data.ctypes.data
+
+    def __eq__(self, other):
+        if not isinstance(other, _py_anon_pod3):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def dev_ptr_base(self):
+        """int: """
+        return int(self._data.dev_ptr_base[0])
+
+    @dev_ptr_base.setter
+    def dev_ptr_base(self, val):
+        self._data.dev_ptr_base = val
+
+    @property
+    def file_offset(self):
+        """int: """
+        return int(self._data.file_offset[0])
+
+    @file_offset.setter
+    def file_offset(self, val):
+        self._data.file_offset = val
+
+    @property
+    def dev_ptr_offset(self):
+        """int: """
+        return int(self._data.dev_ptr_offset[0])
+
+    @dev_ptr_offset.setter
+    def dev_ptr_offset(self, val):
+        self._data.dev_ptr_offset = val
+
+    @property
+    def size_(self):
+        """int: """
+        return int(self._data.size_[0])
+
+    @size_.setter
+    def size_(self, val):
+        self._data.size_ = val
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an _py_anon_pod3 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `_py_anon_pod3_dtype` holding the data.
+        """
+        cdef _py_anon_pod3 obj = _py_anon_pod3.__new__(_py_anon_pod3)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != _py_anon_pod3_dtype:
+            raise ValueError("data array must be of dtype _py_anon_pod3_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False):
+        """Create an _py_anon_pod3 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef _py_anon_pod3 obj = _py_anon_pod3.__new__(_py_anon_pod3)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof((<CUfileIOParams_t*>NULL).u.batch), flag)
+        data = _numpy.ndarray((1,), buffer=buf,
+                              dtype=_py_anon_pod3_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+io_events_dtype = _numpy.dtype([
+    ("cookie", _numpy.intp, ),
+    ("status", _numpy.int32, ),
+    ("ret", _numpy.uint64, ),
+    ], align=True)
+
+
+cdef class IOEvents:
+    """Empty-initialize an array of `CUfileIOEvents_t`.
+
+    The resulting object is of length `size` and of dtype `io_events_dtype`.
+    If default-constructed, the instance represents a single struct.
+
+    Args:
+        size (int): number of structs, default=1.
+
+
+    .. seealso:: `CUfileIOEvents_t`
+    """
+    cdef:
+        readonly object _data
+
+    def __init__(self, size=1):
+        arr = _numpy.empty(size, dtype=io_events_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof(CUfileIOEvents_t), \
+            f"itemsize {self._data.itemsize} mismatches struct size {sizeof(CUfileIOEvents_t)}"
+
+    def __repr__(self):
+        if self._data.size > 1:
+            return f"<{__name__}.IOEvents_Array_{self._data.size} object at {hex(id(self))}>"
+        else:
+            return f"<{__name__}.IOEvents object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        if self._data.size > 1:
+            raise TypeError("int() argument must be a bytes-like object of size 1. "
+                            "To get the pointer address of an array, use .ptr")
+        return self._data.ctypes.data
+
+    def __len__(self):
+        return self._data.size
+
+    def __eq__(self, other):
+        if not isinstance(other, IOEvents):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def cookie(self):
+        """Union[~_numpy.intp, int]: """
+        if self._data.size == 1:
+            return int(self._data.cookie[0])
+        return self._data.cookie
+
+    @cookie.setter
+    def cookie(self, val):
+        self._data.cookie = val
+
+    @property
+    def status(self):
+        """Union[~_numpy.int32, int]: """
+        if self._data.size == 1:
+            return int(self._data.status[0])
+        return self._data.status
+
+    @status.setter
+    def status(self, val):
+        self._data.status = val
+
+    @property
+    def ret(self):
+        """Union[~_numpy.uint64, int]: """
+        if self._data.size == 1:
+            return int(self._data.ret[0])
+        return self._data.ret
+
+    @ret.setter
+    def ret(self, val):
+        self._data.ret = val
+
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            size = self._data.size
+            if key >= size or key <= -(size+1):
+                raise IndexError("index is out of bounds")
+            if key < 0:
+                key += size
+            return IOEvents.from_data(self._data[key:key+1])
+        out = self._data[key]
+        if isinstance(out, _numpy.recarray) and out.dtype == io_events_dtype:
+            return IOEvents.from_data(out)
+        return out
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an IOEvents instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `io_events_dtype` holding the data.
+        """
+        cdef IOEvents obj = IOEvents.__new__(IOEvents)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != io_events_dtype:
+            raise ValueError("data array must be of dtype io_events_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False):
+        """Create an IOEvents instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            size (int): number of structs, default=1.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef IOEvents obj = IOEvents.__new__(IOEvents)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof(CUfileIOEvents_t) * size, flag)
+        data = _numpy.ndarray((size,), buffer=buf,
+                              dtype=io_events_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+descr_dtype = _numpy.dtype([
+    ("type", _numpy.int32, ),
+    ("handle", _py_anon_pod1_dtype, ),
+    ("fs_ops", _numpy.intp, ),
+    ], align=True)
+
+
+cdef class Descr:
+    """Empty-initialize an array of `CUfileDescr_t`.
+
+    The resulting object is of length `size` and of dtype `descr_dtype`.
+    If default-constructed, the instance represents a single struct.
+
+    Args:
+        size (int): number of structs, default=1.
+
+
+    .. seealso:: `CUfileDescr_t`
+    """
+    cdef:
+        readonly object _data
+
+    def __init__(self, size=1):
+        arr = _numpy.empty(size, dtype=descr_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof(CUfileDescr_t), \
+            f"itemsize {self._data.itemsize} mismatches struct size {sizeof(CUfileDescr_t)}"
+
+    def __repr__(self):
+        if self._data.size > 1:
+            return f"<{__name__}.Descr_Array_{self._data.size} object at {hex(id(self))}>"
+        else:
+            return f"<{__name__}.Descr object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        if self._data.size > 1:
+            raise TypeError("int() argument must be a bytes-like object of size 1. "
+                            "To get the pointer address of an array, use .ptr")
+        return self._data.ctypes.data
+
+    def __len__(self):
+        return self._data.size
+
+    def __eq__(self, other):
+        if not isinstance(other, Descr):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def type(self):
+        """Union[~_numpy.int32, int]: """
+        if self._data.size == 1:
+            return int(self._data.type[0])
+        return self._data.type
+
+    @type.setter
+    def type(self, val):
+        self._data.type = val
+
+    @property
+    def handle(self):
+        """_py_anon_pod1_dtype: """
+        return self._data.handle
+
+    @handle.setter
+    def handle(self, val):
+        self._data.handle = val
+
+    @property
+    def fs_ops(self):
+        """Union[~_numpy.intp, int]: """
+        if self._data.size == 1:
+            return int(self._data.fs_ops[0])
+        return self._data.fs_ops
+
+    @fs_ops.setter
+    def fs_ops(self, val):
+        self._data.fs_ops = val
+
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            size = self._data.size
+            if key >= size or key <= -(size+1):
+                raise IndexError("index is out of bounds")
+            if key < 0:
+                key += size
+            return Descr.from_data(self._data[key:key+1])
+        out = self._data[key]
+        if isinstance(out, _numpy.recarray) and out.dtype == descr_dtype:
+            return Descr.from_data(out)
+        return out
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an Descr instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `descr_dtype` holding the data.
+        """
+        cdef Descr obj = Descr.__new__(Descr)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != descr_dtype:
+            raise ValueError("data array must be of dtype descr_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False):
+        """Create an Descr instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            size (int): number of structs, default=1.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef Descr obj = Descr.__new__(Descr)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof(CUfileDescr_t) * size, flag)
+        data = _numpy.ndarray((size,), buffer=buf,
+                              dtype=descr_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+_py_anon_pod2_dtype = _numpy.dtype((
+    _numpy.dtype((_numpy.void, sizeof((<CUfileIOParams_t*>NULL).u))),
+    {
+        "batch": (_py_anon_pod3_dtype, 0),
+    }
+    ))
+
+
+cdef class _py_anon_pod2:
+    """Empty-initialize an instance of `_anon_pod2`.
+
+
+    .. seealso:: `_anon_pod2`
+    """
+    cdef:
+        readonly object _data
+
+        readonly object _batch
+
+    def __init__(self):
+        arr = _numpy.empty(1, dtype=_py_anon_pod2_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof((<CUfileIOParams_t*>NULL).u), \
+            f"itemsize {self._data.itemsize} mismatches union size {sizeof((<CUfileIOParams_t*>NULL).u)}"
+
+    def __repr__(self):
+        return f"<{__name__}._py_anon_pod2 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        return self._data.ctypes.data
+
+    def __eq__(self, other):
+        if not isinstance(other, _py_anon_pod2):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def batch(self):
+        """_py_anon_pod3: """
+        return self._batch
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an _py_anon_pod2 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `_py_anon_pod2_dtype` holding the data.
+        """
+        cdef _py_anon_pod2 obj = _py_anon_pod2.__new__(_py_anon_pod2)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != _py_anon_pod2_dtype:
+            raise ValueError("data array must be of dtype _py_anon_pod2_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        batch_addr = obj._data.batch[0].__array_interface__['data'][0]
+        obj._batch = _py_anon_pod3.from_ptr(batch_addr)
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False):
+        """Create an _py_anon_pod2 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef _py_anon_pod2 obj = _py_anon_pod2.__new__(_py_anon_pod2)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof((<CUfileIOParams_t*>NULL).u), flag)
+        data = _numpy.ndarray((1,), buffer=buf,
+                              dtype=_py_anon_pod2_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        batch_addr = obj._data.batch[0].__array_interface__['data'][0]
+        obj._batch = _py_anon_pod3.from_ptr(batch_addr)
+        return obj
+
+
+io_params_dtype = _numpy.dtype([
+    ("mode", _numpy.int32, ),
+    ("u", _py_anon_pod2_dtype, ),
+    ("fh", _numpy.intp, ),
+    ("opcode", _numpy.int32, ),
+    ("cookie", _numpy.intp, ),
+    ], align=True)
+
+
+cdef class IOParams:
+    """Empty-initialize an array of `CUfileIOParams_t`.
+
+    The resulting object is of length `size` and of dtype `io_params_dtype`.
+    If default-constructed, the instance represents a single struct.
+
+    Args:
+        size (int): number of structs, default=1.
+
+
+    .. seealso:: `CUfileIOParams_t`
+    """
+    cdef:
+        readonly object _data
+
+    def __init__(self, size=1):
+        arr = _numpy.empty(size, dtype=io_params_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof(CUfileIOParams_t), \
+            f"itemsize {self._data.itemsize} mismatches struct size {sizeof(CUfileIOParams_t)}"
+
+    def __repr__(self):
+        if self._data.size > 1:
+            return f"<{__name__}.IOParams_Array_{self._data.size} object at {hex(id(self))}>"
+        else:
+            return f"<{__name__}.IOParams object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return self._data.ctypes.data
+
+    def __int__(self):
+        if self._data.size > 1:
+            raise TypeError("int() argument must be a bytes-like object of size 1. "
+                            "To get the pointer address of an array, use .ptr")
+        return self._data.ctypes.data
+
+    def __len__(self):
+        return self._data.size
+
+    def __eq__(self, other):
+        if not isinstance(other, IOParams):
+            return False
+        if self._data.size != other._data.size:
+            return False
+        if self._data.dtype != other._data.dtype:
+            return False
+        return bool((self._data == other._data).all())
+
+    @property
+    def mode(self):
+        """Union[~_numpy.int32, int]: """
+        if self._data.size == 1:
+            return int(self._data.mode[0])
+        return self._data.mode
+
+    @mode.setter
+    def mode(self, val):
+        self._data.mode = val
+
+    @property
+    def u(self):
+        """_py_anon_pod2_dtype: """
+        return self._data.u
+
+    @u.setter
+    def u(self, val):
+        self._data.u = val
+
+    @property
+    def fh(self):
+        """Union[~_numpy.intp, int]: """
+        if self._data.size == 1:
+            return int(self._data.fh[0])
+        return self._data.fh
+
+    @fh.setter
+    def fh(self, val):
+        self._data.fh = val
+
+    @property
+    def opcode(self):
+        """Union[~_numpy.int32, int]: """
+        if self._data.size == 1:
+            return int(self._data.opcode[0])
+        return self._data.opcode
+
+    @opcode.setter
+    def opcode(self, val):
+        self._data.opcode = val
+
+    @property
+    def cookie(self):
+        """Union[~_numpy.intp, int]: """
+        if self._data.size == 1:
+            return int(self._data.cookie[0])
+        return self._data.cookie
+
+    @cookie.setter
+    def cookie(self, val):
+        self._data.cookie = val
+
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            size = self._data.size
+            if key >= size or key <= -(size+1):
+                raise IndexError("index is out of bounds")
+            if key < 0:
+                key += size
+            return IOParams.from_data(self._data[key:key+1])
+        out = self._data[key]
+        if isinstance(out, _numpy.recarray) and out.dtype == io_params_dtype:
+            return IOParams.from_data(out)
+        return out
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create an IOParams instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `io_params_dtype` holding the data.
+        """
+        cdef IOParams obj = IOParams.__new__(IOParams)
+        if not isinstance(data, (_numpy.ndarray, _numpy.recarray)):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != io_params_dtype:
+            raise ValueError("data array must be of dtype io_params_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False):
+        """Create an IOParams instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            size (int): number of structs, default=1.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef IOParams obj = IOParams.__new__(IOParams)
+        cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE
+        cdef object buf = PyMemoryView_FromMemory(
+            <char*>ptr, sizeof(CUfileIOParams_t) * size, flag)
+        data = _numpy.ndarray((size,), buffer=buf,
+                              dtype=io_params_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+# Hack: Overwrite the generated descr_dtype, which NumPy deduced the offset wrong.
+descr_dtype = _numpy.dtype({
+    "names": ['type', 'handle', 'fs_ops'],
+    "formats": [_numpy.int32, _py_anon_pod1_dtype, _numpy.intp],
+    "offsets": [0, 8, 16],
+}, align=True)
+
+# Hack: Overwrite the generated io_params_dtype, which NumPy deduced the offset wrong.
+io_params_dtype = _numpy.dtype({
+    "names": ['mode', 'u', 'fh', 'opcode', 'cookie'],
+    "formats": [_numpy.int32, _py_anon_pod2_dtype, _numpy.intp, _numpy.int32, _numpy.intp],
+    "offsets": [0, 8, 40, 48, 56],
+}, align=True)
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class OpError(_IntEnum):
+    """See `CUfileOpError`."""
+    SUCCESS = CU_FILE_SUCCESS
+    DRIVER_NOT_INITIALIZED = CU_FILE_DRIVER_NOT_INITIALIZED
+    DRIVER_INVALID_PROPS = CU_FILE_DRIVER_INVALID_PROPS
+    DRIVER_UNSUPPORTED_LIMIT = CU_FILE_DRIVER_UNSUPPORTED_LIMIT
+    DRIVER_VERSION_MISMATCH = CU_FILE_DRIVER_VERSION_MISMATCH
+    DRIVER_VERSION_READ_ERROR = CU_FILE_DRIVER_VERSION_READ_ERROR
+    DRIVER_CLOSING = CU_FILE_DRIVER_CLOSING
+    PLATFORM_NOT_SUPPORTED = CU_FILE_PLATFORM_NOT_SUPPORTED
+    IO_NOT_SUPPORTED = CU_FILE_IO_NOT_SUPPORTED
+    DEVICE_NOT_SUPPORTED = CU_FILE_DEVICE_NOT_SUPPORTED
+    NVFS_DRIVER_ERROR = CU_FILE_NVFS_DRIVER_ERROR
+    CUDA_DRIVER_ERROR = CU_FILE_CUDA_DRIVER_ERROR
+    CUDA_POINTER_INVALID = CU_FILE_CUDA_POINTER_INVALID
+    CUDA_MEMORY_TYPE_INVALID = CU_FILE_CUDA_MEMORY_TYPE_INVALID
+    CUDA_POINTER_RANGE_ERROR = CU_FILE_CUDA_POINTER_RANGE_ERROR
+    CUDA_CONTEXT_MISMATCH = CU_FILE_CUDA_CONTEXT_MISMATCH
+    INVALID_MAPPING_SIZE = CU_FILE_INVALID_MAPPING_SIZE
+    INVALID_MAPPING_RANGE = CU_FILE_INVALID_MAPPING_RANGE
+    INVALID_FILE_TYPE = CU_FILE_INVALID_FILE_TYPE
+    INVALID_FILE_OPEN_FLAG = CU_FILE_INVALID_FILE_OPEN_FLAG
+    DIO_NOT_SET = CU_FILE_DIO_NOT_SET
+    INVALID_VALUE = CU_FILE_INVALID_VALUE
+    MEMORY_ALREADY_REGISTERED = CU_FILE_MEMORY_ALREADY_REGISTERED
+    MEMORY_NOT_REGISTERED = CU_FILE_MEMORY_NOT_REGISTERED
+    PERMISSION_DENIED = CU_FILE_PERMISSION_DENIED
+    DRIVER_ALREADY_OPEN = CU_FILE_DRIVER_ALREADY_OPEN
+    HANDLE_NOT_REGISTERED = CU_FILE_HANDLE_NOT_REGISTERED
+    HANDLE_ALREADY_REGISTERED = CU_FILE_HANDLE_ALREADY_REGISTERED
+    DEVICE_NOT_FOUND = CU_FILE_DEVICE_NOT_FOUND
+    INTERNAL_ERROR = CU_FILE_INTERNAL_ERROR
+    GETNEWFD_FAILED = CU_FILE_GETNEWFD_FAILED
+    NVFS_SETUP_ERROR = CU_FILE_NVFS_SETUP_ERROR
+    IO_DISABLED = CU_FILE_IO_DISABLED
+    BATCH_SUBMIT_FAILED = CU_FILE_BATCH_SUBMIT_FAILED
+    GPU_MEMORY_PINNING_FAILED = CU_FILE_GPU_MEMORY_PINNING_FAILED
+    BATCH_FULL = CU_FILE_BATCH_FULL
+    ASYNC_NOT_SUPPORTED = CU_FILE_ASYNC_NOT_SUPPORTED
+    IO_MAX_ERROR = CU_FILE_IO_MAX_ERROR
+
+class DriverStatusFlags(_IntEnum):
+    """See `CUfileDriverStatusFlags_t`."""
+    LUSTRE_SUPPORTED = CU_FILE_LUSTRE_SUPPORTED
+    WEKAFS_SUPPORTED = CU_FILE_WEKAFS_SUPPORTED
+    NFS_SUPPORTED = CU_FILE_NFS_SUPPORTED
+    GPFS_SUPPORTED = CU_FILE_GPFS_SUPPORTED
+    NVME_SUPPORTED = CU_FILE_NVME_SUPPORTED
+    NVMEOF_SUPPORTED = CU_FILE_NVMEOF_SUPPORTED
+    SCSI_SUPPORTED = CU_FILE_SCSI_SUPPORTED
+    SCALEFLUX_CSD_SUPPORTED = CU_FILE_SCALEFLUX_CSD_SUPPORTED
+    NVMESH_SUPPORTED = CU_FILE_NVMESH_SUPPORTED
+    BEEGFS_SUPPORTED = CU_FILE_BEEGFS_SUPPORTED
+    NVME_P2P_SUPPORTED = CU_FILE_NVME_P2P_SUPPORTED
+    SCATEFS_SUPPORTED = CU_FILE_SCATEFS_SUPPORTED
+
+class DriverControlFlags(_IntEnum):
+    """See `CUfileDriverControlFlags_t`."""
+    USE_POLL_MODE = CU_FILE_USE_POLL_MODE
+    ALLOW_COMPAT_MODE = CU_FILE_ALLOW_COMPAT_MODE
+
+class FeatureFlags(_IntEnum):
+    """See `CUfileFeatureFlags_t`."""
+    DYN_ROUTING_SUPPORTED = CU_FILE_DYN_ROUTING_SUPPORTED
+    BATCH_IO_SUPPORTED = CU_FILE_BATCH_IO_SUPPORTED
+    STREAMS_SUPPORTED = CU_FILE_STREAMS_SUPPORTED
+    PARALLEL_IO_SUPPORTED = CU_FILE_PARALLEL_IO_SUPPORTED
+
+class FileHandleType(_IntEnum):
+    """See `CUfileFileHandleType`."""
+    OPAQUE_FD = CU_FILE_HANDLE_TYPE_OPAQUE_FD
+    OPAQUE_WIN32 = CU_FILE_HANDLE_TYPE_OPAQUE_WIN32
+    USERSPACE_FS = CU_FILE_HANDLE_TYPE_USERSPACE_FS
+
+class Opcode(_IntEnum):
+    """See `CUfileOpcode_t`."""
+    READ = CUFILE_READ
+    WRITE = CUFILE_WRITE
+
+class Status(_IntEnum):
+    """See `CUfileStatus_t`."""
+    WAITING = CUFILE_WAITING
+    PENDING = CUFILE_PENDING
+    INVALID = CUFILE_INVALID
+    CANCELED = CUFILE_CANCELED
+    COMPLETE = CUFILE_COMPLETE
+    TIMEOUT = CUFILE_TIMEOUT
+    FAILED = CUFILE_FAILED
+
+class BatchMode(_IntEnum):
+    """See `CUfileBatchMode_t`."""
+    BATCH = CUFILE_BATCH
+
+class SizeTConfigParameter(_IntEnum):
+    """See `CUFileSizeTConfigParameter_t`."""
+    PROFILE_STATS = CUFILE_PARAM_PROFILE_STATS
+    EXECUTION_MAX_IO_QUEUE_DEPTH = CUFILE_PARAM_EXECUTION_MAX_IO_QUEUE_DEPTH
+    EXECUTION_MAX_IO_THREADS = CUFILE_PARAM_EXECUTION_MAX_IO_THREADS
+    EXECUTION_MIN_IO_THRESHOLD_SIZE_KB = CUFILE_PARAM_EXECUTION_MIN_IO_THRESHOLD_SIZE_KB
+    EXECUTION_MAX_REQUEST_PARALLELISM = CUFILE_PARAM_EXECUTION_MAX_REQUEST_PARALLELISM
+    PROPERTIES_MAX_DIRECT_IO_SIZE_KB = CUFILE_PARAM_PROPERTIES_MAX_DIRECT_IO_SIZE_KB
+    PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB = CUFILE_PARAM_PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB
+    PROPERTIES_PER_BUFFER_CACHE_SIZE_KB = CUFILE_PARAM_PROPERTIES_PER_BUFFER_CACHE_SIZE_KB
+    PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB = CUFILE_PARAM_PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB
+    PROPERTIES_IO_BATCHSIZE = CUFILE_PARAM_PROPERTIES_IO_BATCHSIZE
+    POLLTHRESHOLD_SIZE_KB = CUFILE_PARAM_POLLTHRESHOLD_SIZE_KB
+    PROPERTIES_BATCH_IO_TIMEOUT_MS = CUFILE_PARAM_PROPERTIES_BATCH_IO_TIMEOUT_MS
+
+class BoolConfigParameter(_IntEnum):
+    """See `CUFileBoolConfigParameter_t`."""
+    PROPERTIES_USE_POLL_MODE = CUFILE_PARAM_PROPERTIES_USE_POLL_MODE
+    PROPERTIES_ALLOW_COMPAT_MODE = CUFILE_PARAM_PROPERTIES_ALLOW_COMPAT_MODE
+    FORCE_COMPAT_MODE = CUFILE_PARAM_FORCE_COMPAT_MODE
+    FS_MISC_API_CHECK_AGGRESSIVE = CUFILE_PARAM_FS_MISC_API_CHECK_AGGRESSIVE
+    EXECUTION_PARALLEL_IO = CUFILE_PARAM_EXECUTION_PARALLEL_IO
+    PROFILE_NVTX = CUFILE_PARAM_PROFILE_NVTX
+    PROPERTIES_ALLOW_SYSTEM_MEMORY = CUFILE_PARAM_PROPERTIES_ALLOW_SYSTEM_MEMORY
+    USE_PCIP2PDMA = CUFILE_PARAM_USE_PCIP2PDMA
+    PREFER_IO_URING = CUFILE_PARAM_PREFER_IO_URING
+    FORCE_ODIRECT_MODE = CUFILE_PARAM_FORCE_ODIRECT_MODE
+    SKIP_TOPOLOGY_DETECTION = CUFILE_PARAM_SKIP_TOPOLOGY_DETECTION
+    STREAM_MEMOPS_BYPASS = CUFILE_PARAM_STREAM_MEMOPS_BYPASS
+
+class StringConfigParameter(_IntEnum):
+    """See `CUFileStringConfigParameter_t`."""
+    LOGGING_LEVEL = CUFILE_PARAM_LOGGING_LEVEL
+    ENV_LOGFILE_PATH = CUFILE_PARAM_ENV_LOGFILE_PATH
+    LOG_DIR = CUFILE_PARAM_LOG_DIR
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+ctypedef fused ReturnT:
+    CUfileError_t
+    ssize_t
+
+
+class cuFileError(Exception):
+
+    def __init__(self, status, cu_err=None):
+        self.status = status
+        self.cuda_error = cu_err
+        s = OpError(status)
+        cdef str err = f"{s.name} ({s.value}): {op_status_error(status)}"
+        if cu_err is not None:
+            e = pyCUresult(cu_err)
+            err += f"; CUDA status: {e.name} ({e.value})"
+        super(cuFileError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status, self.cuda_error))
+
+
+@cython.profile(False)
+cdef int check_status(ReturnT status) except 1 nogil:
+    if ReturnT is CUfileError_t:
+        if status.err != 0 or status.cu_err != 0:
+            with gil:
+                raise cuFileError(status.err, status.cu_err)
+    elif ReturnT is ssize_t:
+        if status == -1:
+            # note: this assumes cuFile already properly resets errno in each API
+            with gil:
+                raise cuFileError(errno.errno)
+    return 0
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef intptr_t handle_register(intptr_t descr) except? 0:
+    """cuFileHandleRegister is required, and performs extra checking that is memoized to provide increased performance on later cuFile operations.
+
+    Args:
+        descr (intptr_t): ``CUfileDescr_t`` file descriptor (OS agnostic).
+
+    Returns:
+        intptr_t: ``CUfileHandle_t`` opaque file handle for IO operations.
+
+    .. seealso:: `cuFileHandleRegister`
+    """
+    cdef Handle fh
+    with nogil:
+        status = cuFileHandleRegister(&fh, <CUfileDescr_t*>descr)
+    check_status(status)
+    return <intptr_t>fh
+
+
+cpdef void handle_deregister(intptr_t fh) except*:
+    """releases a registered filehandle from cuFile.
+
+    Args:
+        fh (intptr_t): ``CUfileHandle_t`` file handle.
+
+    .. seealso:: `cuFileHandleDeregister`
+    """
+    cuFileHandleDeregister(<Handle>fh)
+
+
+cpdef buf_register(intptr_t buf_ptr_base, size_t length, int flags):
+    """register an existing cudaMalloced memory with cuFile to pin for GPUDirect Storage access or register host allocated memory with cuFile.
+
+    Args:
+        buf_ptr_base (intptr_t): buffer pointer allocated.
+        length (size_t): size of memory region from the above specified bufPtr.
+        flags (int): CU_FILE_RDMA_REGISTER.
+
+    .. seealso:: `cuFileBufRegister`
+    """
+    with nogil:
+        status = cuFileBufRegister(<const void*>buf_ptr_base, length, flags)
+    check_status(status)
+
+
+cpdef buf_deregister(intptr_t buf_ptr_base):
+    """deregister an already registered device or host memory from cuFile.
+
+    Args:
+        buf_ptr_base (intptr_t): buffer pointer to deregister.
+
+    .. seealso:: `cuFileBufDeregister`
+    """
+    with nogil:
+        status = cuFileBufDeregister(<const void*>buf_ptr_base)
+    check_status(status)
+
+
+cpdef read(intptr_t fh, intptr_t buf_ptr_base, size_t size, off_t file_offset, off_t buf_ptr_offset):
+    """read data from a registered file handle to a specified device or host memory.
+
+    Args:
+        fh (intptr_t): ``CUfileHandle_t`` opaque file handle.
+        buf_ptr_base (intptr_t): base address of buffer in device or host memory.
+        size (size_t): size bytes to read.
+        file_offset (off_t): file-offset from begining of the file.
+        buf_ptr_offset (off_t): offset relative to the buf_ptr_base pointer to read into.
+
+    .. seealso:: `cuFileRead`
+    """
+    with nogil:
+        status = cuFileRead(<Handle>fh, <void*>buf_ptr_base, size, file_offset, buf_ptr_offset)
+    check_status(status)
+
+
+cpdef write(intptr_t fh, intptr_t buf_ptr_base, size_t size, off_t file_offset, off_t buf_ptr_offset):
+    """write data from a specified device or host memory to a registered file handle.
+
+    Args:
+        fh (intptr_t): ``CUfileHandle_t`` opaque file handle.
+        buf_ptr_base (intptr_t): base address of buffer in device or host memory.
+        size (size_t): size bytes to write.
+        file_offset (off_t): file-offset from begining of the file.
+        buf_ptr_offset (off_t): offset relative to the buf_ptr_base pointer to write from.
+
+    .. seealso:: `cuFileWrite`
+    """
+    with nogil:
+        status = cuFileWrite(<Handle>fh, <const void*>buf_ptr_base, size, file_offset, buf_ptr_offset)
+    check_status(status)
+
+
+cpdef driver_open():
+    """Initialize the cuFile library and open the nvidia-fs driver.
+
+    .. seealso:: `cuFileDriverOpen`
+    """
+    with nogil:
+        status = cuFileDriverOpen()
+    check_status(status)
+
+
+cpdef use_count():
+    """returns use count of cufile drivers at that moment by the process.
+
+    .. seealso:: `cuFileUseCount`
+    """
+    with nogil:
+        status = cuFileUseCount()
+    check_status(status)
+
+
+cpdef driver_get_properties(intptr_t props):
+    """Gets the Driver session properties.
+
+    Args:
+        props (intptr_t): to set.
+
+    .. seealso:: `cuFileDriverGetProperties`
+    """
+    with nogil:
+        status = cuFileDriverGetProperties(<CUfileDrvProps_t*>props)
+    check_status(status)
+
+
+cpdef driver_set_poll_mode(bint poll, size_t poll_threshold_size):
+    """Sets whether the Read/Write APIs use polling to do IO operations.
+
+    Args:
+        poll (bint): boolean to indicate whether to use poll mode or not.
+        poll_threshold_size (size_t): max IO size to use for POLLING mode in KB.
+
+    .. seealso:: `cuFileDriverSetPollMode`
+    """
+    with nogil:
+        status = cuFileDriverSetPollMode(<cpp_bool>poll, poll_threshold_size)
+    check_status(status)
+
+
+cpdef driver_set_max_direct_io_size(size_t max_direct_io_size):
+    """Control parameter to set max IO size(KB) used by the library to talk to nvidia-fs driver.
+
+    Args:
+        max_direct_io_size (size_t): maximum allowed direct io size in KB.
+
+    .. seealso:: `cuFileDriverSetMaxDirectIOSize`
+    """
+    with nogil:
+        status = cuFileDriverSetMaxDirectIOSize(max_direct_io_size)
+    check_status(status)
+
+
+cpdef driver_set_max_cache_size(size_t max_cache_size):
+    """Control parameter to set maximum GPU memory reserved per device by the library for internal buffering.
+
+    Args:
+        max_cache_size (size_t): The maximum GPU buffer space per device used for internal use in KB.
+
+    .. seealso:: `cuFileDriverSetMaxCacheSize`
+    """
+    with nogil:
+        status = cuFileDriverSetMaxCacheSize(max_cache_size)
+    check_status(status)
+
+
+cpdef driver_set_max_pinned_mem_size(size_t max_pinned_size):
+    """Sets maximum buffer space that is pinned in KB for use by ``cuFileBufRegister``.
+
+    Args:
+        max_pinned_size (size_t): maximum buffer space that is pinned in KB.
+
+    .. seealso:: `cuFileDriverSetMaxPinnedMemSize`
+    """
+    with nogil:
+        status = cuFileDriverSetMaxPinnedMemSize(max_pinned_size)
+    check_status(status)
+
+
+cpdef intptr_t batch_io_set_up(unsigned nr) except? 0:
+    cdef BatchHandle batch_idp
+    with nogil:
+        status = cuFileBatchIOSetUp(&batch_idp, nr)
+    check_status(status)
+    return <intptr_t>batch_idp
+
+
+cpdef batch_io_submit(intptr_t batch_idp, unsigned nr, intptr_t iocbp, unsigned int flags):
+    with nogil:
+        status = cuFileBatchIOSubmit(<BatchHandle>batch_idp, nr, <CUfileIOParams_t*>iocbp, flags)
+    check_status(status)
+
+
+cpdef batch_io_get_status(intptr_t batch_idp, unsigned min_nr, intptr_t nr, intptr_t iocbp, intptr_t timeout):
+    with nogil:
+        status = cuFileBatchIOGetStatus(<BatchHandle>batch_idp, min_nr, <unsigned*>nr, <CUfileIOEvents_t*>iocbp, <timespec*>timeout)
+    check_status(status)
+
+
+cpdef batch_io_cancel(intptr_t batch_idp):
+    with nogil:
+        status = cuFileBatchIOCancel(<BatchHandle>batch_idp)
+    check_status(status)
+
+
+cpdef void batch_io_destroy(intptr_t batch_idp) except*:
+    cuFileBatchIODestroy(<BatchHandle>batch_idp)
+
+
+cpdef read_async(intptr_t fh, intptr_t buf_ptr_base, intptr_t size_p, intptr_t file_offset_p, intptr_t buf_ptr_offset_p, intptr_t bytes_read_p, intptr_t stream):
+    with nogil:
+        status = cuFileReadAsync(<Handle>fh, <void*>buf_ptr_base, <size_t*>size_p, <off_t*>file_offset_p, <off_t*>buf_ptr_offset_p, <ssize_t*>bytes_read_p, <void*>stream)
+    check_status(status)
+
+
+cpdef write_async(intptr_t fh, intptr_t buf_ptr_base, intptr_t size_p, intptr_t file_offset_p, intptr_t buf_ptr_offset_p, intptr_t bytes_written_p, intptr_t stream):
+    with nogil:
+        status = cuFileWriteAsync(<Handle>fh, <void*>buf_ptr_base, <size_t*>size_p, <off_t*>file_offset_p, <off_t*>buf_ptr_offset_p, <ssize_t*>bytes_written_p, <void*>stream)
+    check_status(status)
+
+
+cpdef stream_register(intptr_t stream, unsigned flags):
+    with nogil:
+        status = cuFileStreamRegister(<void*>stream, flags)
+    check_status(status)
+
+
+cpdef stream_deregister(intptr_t stream):
+    with nogil:
+        status = cuFileStreamDeregister(<void*>stream)
+    check_status(status)
+
+
+cpdef int get_version() except? 0:
+    cdef int version
+    with nogil:
+        status = cuFileGetVersion(&version)
+    check_status(status)
+    return version
+
+
+cpdef size_t get_parameter_size_t(int param) except? 0:
+    cdef size_t value
+    with nogil:
+        status = cuFileGetParameterSizeT(<_SizeTConfigParameter>param, &value)
+    check_status(status)
+    return value
+
+
+cpdef bint get_parameter_bool(int param) except? 0:
+    cdef cpp_bool value
+    with nogil:
+        status = cuFileGetParameterBool(<_BoolConfigParameter>param, &value)
+    check_status(status)
+    return <bint>value
+
+
+cpdef str get_parameter_string(int param, int len):
+    cdef bytes _desc_str_ = bytes(len)
+    cdef char* desc_str = _desc_str_
+    with nogil:
+        status = cuFileGetParameterString(<_StringConfigParameter>param, desc_str, len)
+    check_status(status)
+    return _desc_str_.decode()
+
+
+cpdef set_parameter_size_t(int param, size_t value):
+    with nogil:
+        status = cuFileSetParameterSizeT(<_SizeTConfigParameter>param, value)
+    check_status(status)
+
+
+cpdef set_parameter_bool(int param, bint value):
+    with nogil:
+        status = cuFileSetParameterBool(<_BoolConfigParameter>param, <cpp_bool>value)
+    check_status(status)
+
+
+cpdef set_parameter_string(int param, intptr_t desc_str):
+    with nogil:
+        status = cuFileSetParameterString(<_StringConfigParameter>param, <const char*>desc_str)
+    check_status(status)
+
+
+cpdef str op_status_error(int status):
+    """cufileop status string.
+
+    Args:
+        status (OpError): the error status to query.
+
+    .. seealso:: `cufileop_status_error`
+    """
+    cdef bytes _output_
+    _output_ = cufileop_status_error(<_OpError>status)
+    return _output_.decode()
+
+
+cpdef driver_close():
+    """reset the cuFile library and release the nvidia-fs driver
+    """
+    with nogil:
+        status = cuFileDriverClose_v2()
+    check_status(status)
diff --git a/cuda_bindings/cuda/bindings/cycufile.pxd b/cuda_bindings/cuda/bindings/cycufile.pxd
new file mode 100644
index 000000000..ac19e14e2
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cycufile.pxd
@@ -0,0 +1,256 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+from libc.time cimport time_t
+from libcpp cimport bool as cpp_bool
+from posix.types cimport off_t
+
+cimport cuda.bindings.cydriver
+from cuda.bindings.cydriver cimport CUresult
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# TODO: switch to "from libc.time cimport timespec" once we can use recent
+# Cython to build
+cdef extern from "<time.h>":
+    cdef struct timespec:
+        time_t tv_sec
+        long   tv_nsec
+cdef extern from "<sys/socket.h>":
+    cdef struct sockaddr:
+        unsigned short sa_family
+        char sa_data[14]
+    ctypedef sockaddr sockaddr_t
+
+
+cdef extern from '<cufile.h>':
+    # enums
+    ctypedef enum CUfileOpError:
+        CU_FILE_SUCCESS
+        CU_FILE_DRIVER_NOT_INITIALIZED
+        CU_FILE_DRIVER_INVALID_PROPS
+        CU_FILE_DRIVER_UNSUPPORTED_LIMIT
+        CU_FILE_DRIVER_VERSION_MISMATCH
+        CU_FILE_DRIVER_VERSION_READ_ERROR
+        CU_FILE_DRIVER_CLOSING
+        CU_FILE_PLATFORM_NOT_SUPPORTED
+        CU_FILE_IO_NOT_SUPPORTED
+        CU_FILE_DEVICE_NOT_SUPPORTED
+        CU_FILE_NVFS_DRIVER_ERROR
+        CU_FILE_CUDA_DRIVER_ERROR
+        CU_FILE_CUDA_POINTER_INVALID
+        CU_FILE_CUDA_MEMORY_TYPE_INVALID
+        CU_FILE_CUDA_POINTER_RANGE_ERROR
+        CU_FILE_CUDA_CONTEXT_MISMATCH
+        CU_FILE_INVALID_MAPPING_SIZE
+        CU_FILE_INVALID_MAPPING_RANGE
+        CU_FILE_INVALID_FILE_TYPE
+        CU_FILE_INVALID_FILE_OPEN_FLAG
+        CU_FILE_DIO_NOT_SET
+        CU_FILE_INVALID_VALUE
+        CU_FILE_MEMORY_ALREADY_REGISTERED
+        CU_FILE_MEMORY_NOT_REGISTERED
+        CU_FILE_PERMISSION_DENIED
+        CU_FILE_DRIVER_ALREADY_OPEN
+        CU_FILE_HANDLE_NOT_REGISTERED
+        CU_FILE_HANDLE_ALREADY_REGISTERED
+        CU_FILE_DEVICE_NOT_FOUND
+        CU_FILE_INTERNAL_ERROR
+        CU_FILE_GETNEWFD_FAILED
+        CU_FILE_NVFS_SETUP_ERROR
+        CU_FILE_IO_DISABLED
+        CU_FILE_BATCH_SUBMIT_FAILED
+        CU_FILE_GPU_MEMORY_PINNING_FAILED
+        CU_FILE_BATCH_FULL
+        CU_FILE_ASYNC_NOT_SUPPORTED
+        CU_FILE_IO_MAX_ERROR
+
+    ctypedef enum CUfileDriverStatusFlags_t:
+        CU_FILE_LUSTRE_SUPPORTED
+        CU_FILE_WEKAFS_SUPPORTED
+        CU_FILE_NFS_SUPPORTED
+        CU_FILE_GPFS_SUPPORTED
+        CU_FILE_NVME_SUPPORTED
+        CU_FILE_NVMEOF_SUPPORTED
+        CU_FILE_SCSI_SUPPORTED
+        CU_FILE_SCALEFLUX_CSD_SUPPORTED
+        CU_FILE_NVMESH_SUPPORTED
+        CU_FILE_BEEGFS_SUPPORTED
+        CU_FILE_NVME_P2P_SUPPORTED
+        CU_FILE_SCATEFS_SUPPORTED
+
+    ctypedef enum CUfileDriverControlFlags_t:
+        CU_FILE_USE_POLL_MODE
+        CU_FILE_ALLOW_COMPAT_MODE
+
+    ctypedef enum CUfileFeatureFlags_t:
+        CU_FILE_DYN_ROUTING_SUPPORTED
+        CU_FILE_BATCH_IO_SUPPORTED
+        CU_FILE_STREAMS_SUPPORTED
+        CU_FILE_PARALLEL_IO_SUPPORTED
+
+    ctypedef enum CUfileFileHandleType:
+        CU_FILE_HANDLE_TYPE_OPAQUE_FD
+        CU_FILE_HANDLE_TYPE_OPAQUE_WIN32
+        CU_FILE_HANDLE_TYPE_USERSPACE_FS
+
+    ctypedef enum CUfileOpcode_t:
+        CUFILE_READ
+        CUFILE_WRITE
+
+    ctypedef enum CUfileStatus_t:
+        CUFILE_WAITING
+        CUFILE_PENDING
+        CUFILE_INVALID
+        CUFILE_CANCELED
+        CUFILE_COMPLETE
+        CUFILE_TIMEOUT
+        CUFILE_FAILED
+
+    ctypedef enum CUfileBatchMode_t:
+        CUFILE_BATCH
+
+    ctypedef enum CUFileSizeTConfigParameter_t:
+        CUFILE_PARAM_PROFILE_STATS
+        CUFILE_PARAM_EXECUTION_MAX_IO_QUEUE_DEPTH
+        CUFILE_PARAM_EXECUTION_MAX_IO_THREADS
+        CUFILE_PARAM_EXECUTION_MIN_IO_THRESHOLD_SIZE_KB
+        CUFILE_PARAM_EXECUTION_MAX_REQUEST_PARALLELISM
+        CUFILE_PARAM_PROPERTIES_MAX_DIRECT_IO_SIZE_KB
+        CUFILE_PARAM_PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB
+        CUFILE_PARAM_PROPERTIES_PER_BUFFER_CACHE_SIZE_KB
+        CUFILE_PARAM_PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB
+        CUFILE_PARAM_PROPERTIES_IO_BATCHSIZE
+        CUFILE_PARAM_POLLTHRESHOLD_SIZE_KB
+        CUFILE_PARAM_PROPERTIES_BATCH_IO_TIMEOUT_MS
+
+    ctypedef enum CUFileBoolConfigParameter_t:
+        CUFILE_PARAM_PROPERTIES_USE_POLL_MODE
+        CUFILE_PARAM_PROPERTIES_ALLOW_COMPAT_MODE
+        CUFILE_PARAM_FORCE_COMPAT_MODE
+        CUFILE_PARAM_FS_MISC_API_CHECK_AGGRESSIVE
+        CUFILE_PARAM_EXECUTION_PARALLEL_IO
+        CUFILE_PARAM_PROFILE_NVTX
+        CUFILE_PARAM_PROPERTIES_ALLOW_SYSTEM_MEMORY
+        CUFILE_PARAM_USE_PCIP2PDMA
+        CUFILE_PARAM_PREFER_IO_URING
+        CUFILE_PARAM_FORCE_ODIRECT_MODE
+        CUFILE_PARAM_SKIP_TOPOLOGY_DETECTION
+        CUFILE_PARAM_STREAM_MEMOPS_BYPASS
+
+    ctypedef enum CUFileStringConfigParameter_t:
+        CUFILE_PARAM_LOGGING_LEVEL
+        CUFILE_PARAM_ENV_LOGFILE_PATH
+        CUFILE_PARAM_LOG_DIR
+
+    # types
+    ctypedef void* CUfileHandle_t 'CUfileHandle_t'
+    ctypedef void* CUfileBatchHandle_t 'CUfileBatchHandle_t'
+    ctypedef struct CUfileError_t 'CUfileError_t':
+        CUfileOpError err
+        CUresult cu_err
+    cdef struct _anon_pod0 '_anon_pod0':
+        unsigned int major_version
+        unsigned int minor_version
+        size_t poll_thresh_size
+        size_t max_direct_io_size
+        unsigned int dstatusflags
+        unsigned int dcontrolflags
+    ctypedef struct cufileRDMAInfo_t 'cufileRDMAInfo_t':
+        int version
+        int desc_len
+        char* desc_str
+    ctypedef struct CUfileFSOps_t 'CUfileFSOps_t':
+        char* (*fs_type)(void*)
+        int (*getRDMADeviceList)(void*, sockaddr_t**)
+        int (*getRDMADevicePriority)(void*, char*, size_t, loff_t, sockaddr_t*)
+        ssize_t (*read)(void*, char*, size_t, loff_t, cufileRDMAInfo_t*)
+        ssize_t (*write)(void*, const char*, size_t, loff_t, cufileRDMAInfo_t*)
+    cdef union _anon_pod1 '_anon_pod1':
+        int fd
+        void* handle
+    cdef struct _anon_pod3 '_anon_pod3':
+        void* devPtr_base
+        off_t file_offset
+        off_t devPtr_offset
+        size_t size
+    ctypedef struct CUfileIOEvents_t 'CUfileIOEvents_t':
+        void* cookie
+        CUfileStatus_t status
+        size_t ret
+    ctypedef struct CUfileDrvProps_t 'CUfileDrvProps_t':
+        _anon_pod0 nvfs
+        unsigned int fflags
+        unsigned int max_device_cache_size
+        unsigned int per_buffer_cache_size
+        unsigned int max_device_pinned_mem_size
+        unsigned int max_batch_io_size
+        unsigned int max_batch_io_timeout_msecs
+    ctypedef struct CUfileDescr_t 'CUfileDescr_t':
+        CUfileFileHandleType type
+        _anon_pod1 handle
+        CUfileFSOps_t* fs_ops
+    cdef union _anon_pod2 '_anon_pod2':
+        _anon_pod3 batch
+    ctypedef struct CUfileIOParams_t 'CUfileIOParams_t':
+        CUfileBatchMode_t mode
+        _anon_pod2 u
+        CUfileHandle_t fh
+        CUfileOpcode_t opcode
+        void* cookie
+
+
+cdef extern from *:
+    """
+    // This is the missing piece we need to supply to help Cython & C++ compilers.
+    inline bool operator==(const CUfileError_t& lhs, const CUfileError_t& rhs) {
+        return (lhs.err == rhs.err) && (lhs.cu_err == rhs.cu_err);
+    }
+    static CUfileError_t CUFILE_LOADING_ERROR{(CUfileOpError)-1, (CUresult)-1};
+    """
+    const CUfileError_t CUFILE_LOADING_ERROR
+    ctypedef void* CUstream "CUstream"
+
+    const char* cufileop_status_error(CUfileOpError)
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cdef CUfileError_t cuFileHandleRegister(CUfileHandle_t* fh, CUfileDescr_t* descr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef void cuFileHandleDeregister(CUfileHandle_t fh) except* nogil
+cdef CUfileError_t cuFileBufRegister(const void* bufPtr_base, size_t length, int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileBufDeregister(const void* bufPtr_base) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef ssize_t cuFileRead(CUfileHandle_t fh, void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil
+cdef ssize_t cuFileWrite(CUfileHandle_t fh, const void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil
+cdef CUfileError_t cuFileDriverOpen() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileDriverClose_v2() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef long cuFileUseCount() except* nogil
+cdef CUfileError_t cuFileDriverGetProperties(CUfileDrvProps_t* props) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileDriverSetPollMode(cpp_bool poll, size_t poll_threshold_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileDriverSetMaxDirectIOSize(size_t max_direct_io_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileDriverSetMaxCacheSize(size_t max_cache_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileDriverSetMaxPinnedMemSize(size_t max_pinned_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileBatchIOSetUp(CUfileBatchHandle_t* batch_idp, unsigned nr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileBatchIOSubmit(CUfileBatchHandle_t batch_idp, unsigned nr, CUfileIOParams_t* iocbp, unsigned int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileBatchIOGetStatus(CUfileBatchHandle_t batch_idp, unsigned min_nr, unsigned* nr, CUfileIOEvents_t* iocbp, timespec* timeout) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileBatchIOCancel(CUfileBatchHandle_t batch_idp) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef void cuFileBatchIODestroy(CUfileBatchHandle_t batch_idp) except* nogil
+cdef CUfileError_t cuFileReadAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_read_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileWriteAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_written_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileStreamRegister(CUstream stream, unsigned flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileStreamDeregister(CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileGetVersion(int* version) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileGetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileGetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileGetParameterString(CUFileStringConfigParameter_t param, char* desc_str, int len) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileSetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileSetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
+cdef CUfileError_t cuFileSetParameterString(CUFileStringConfigParameter_t param, const char* desc_str) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/cycufile.pyx b/cuda_bindings/cuda/bindings/cycufile.pyx
new file mode 100644
index 000000000..621bd083c
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cycufile.pyx
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 12.9.0. Do not modify it directly.
+
+from ._internal cimport cufile as _cufile
+
+import cython
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef CUfileError_t cuFileHandleRegister(CUfileHandle_t* fh, CUfileDescr_t* descr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileHandleRegister(fh, descr)
+
+
+@cython.show_performance_hints(False)
+cdef void cuFileHandleDeregister(CUfileHandle_t fh) except* nogil:
+    _cufile._cuFileHandleDeregister(fh)
+
+
+cdef CUfileError_t cuFileBufRegister(const void* bufPtr_base, size_t length, int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBufRegister(bufPtr_base, length, flags)
+
+
+cdef CUfileError_t cuFileBufDeregister(const void* bufPtr_base) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBufDeregister(bufPtr_base)
+
+
+cdef ssize_t cuFileRead(CUfileHandle_t fh, void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil:
+    return _cufile._cuFileRead(fh, bufPtr_base, size, file_offset, bufPtr_offset)
+
+
+cdef ssize_t cuFileWrite(CUfileHandle_t fh, const void* bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset) except* nogil:
+    return _cufile._cuFileWrite(fh, bufPtr_base, size, file_offset, bufPtr_offset)
+
+
+cdef CUfileError_t cuFileDriverOpen() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverOpen()
+
+
+cdef CUfileError_t cuFileDriverClose_v2() except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverClose_v2()
+
+
+cdef long cuFileUseCount() except* nogil:
+    return _cufile._cuFileUseCount()
+
+
+cdef CUfileError_t cuFileDriverGetProperties(CUfileDrvProps_t* props) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverGetProperties(props)
+
+
+cdef CUfileError_t cuFileDriverSetPollMode(cpp_bool poll, size_t poll_threshold_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverSetPollMode(poll, poll_threshold_size)
+
+
+cdef CUfileError_t cuFileDriverSetMaxDirectIOSize(size_t max_direct_io_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverSetMaxDirectIOSize(max_direct_io_size)
+
+
+cdef CUfileError_t cuFileDriverSetMaxCacheSize(size_t max_cache_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverSetMaxCacheSize(max_cache_size)
+
+
+cdef CUfileError_t cuFileDriverSetMaxPinnedMemSize(size_t max_pinned_size) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileDriverSetMaxPinnedMemSize(max_pinned_size)
+
+
+cdef CUfileError_t cuFileBatchIOSetUp(CUfileBatchHandle_t* batch_idp, unsigned nr) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBatchIOSetUp(batch_idp, nr)
+
+
+cdef CUfileError_t cuFileBatchIOSubmit(CUfileBatchHandle_t batch_idp, unsigned nr, CUfileIOParams_t* iocbp, unsigned int flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBatchIOSubmit(batch_idp, nr, iocbp, flags)
+
+
+cdef CUfileError_t cuFileBatchIOGetStatus(CUfileBatchHandle_t batch_idp, unsigned min_nr, unsigned* nr, CUfileIOEvents_t* iocbp, timespec* timeout) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBatchIOGetStatus(batch_idp, min_nr, nr, iocbp, timeout)
+
+
+cdef CUfileError_t cuFileBatchIOCancel(CUfileBatchHandle_t batch_idp) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileBatchIOCancel(batch_idp)
+
+
+@cython.show_performance_hints(False)
+cdef void cuFileBatchIODestroy(CUfileBatchHandle_t batch_idp) except* nogil:
+    _cufile._cuFileBatchIODestroy(batch_idp)
+
+
+cdef CUfileError_t cuFileReadAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_read_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileReadAsync(fh, bufPtr_base, size_p, file_offset_p, bufPtr_offset_p, bytes_read_p, stream)
+
+
+cdef CUfileError_t cuFileWriteAsync(CUfileHandle_t fh, void* bufPtr_base, size_t* size_p, off_t* file_offset_p, off_t* bufPtr_offset_p, ssize_t* bytes_written_p, CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileWriteAsync(fh, bufPtr_base, size_p, file_offset_p, bufPtr_offset_p, bytes_written_p, stream)
+
+
+cdef CUfileError_t cuFileStreamRegister(CUstream stream, unsigned flags) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileStreamRegister(stream, flags)
+
+
+cdef CUfileError_t cuFileStreamDeregister(CUstream stream) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileStreamDeregister(stream)
+
+
+cdef CUfileError_t cuFileGetVersion(int* version) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileGetVersion(version)
+
+
+cdef CUfileError_t cuFileGetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileGetParameterSizeT(param, value)
+
+
+cdef CUfileError_t cuFileGetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool* value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileGetParameterBool(param, value)
+
+
+cdef CUfileError_t cuFileGetParameterString(CUFileStringConfigParameter_t param, char* desc_str, int len) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileGetParameterString(param, desc_str, len)
+
+
+cdef CUfileError_t cuFileSetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileSetParameterSizeT(param, value)
+
+
+cdef CUfileError_t cuFileSetParameterBool(CUFileBoolConfigParameter_t param, cpp_bool value) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileSetParameterBool(param, value)
+
+
+cdef CUfileError_t cuFileSetParameterString(CUFileStringConfigParameter_t param, const char* desc_str) except?<CUfileError_t>CUFILE_LOADING_ERROR nogil:
+    return _cufile._cuFileSetParameterString(param, desc_str)
diff --git a/cuda_bindings/docs/source/api.rst b/cuda_bindings/docs/source/api.rst
index 04c010ffd..28a2b8d24 100644
--- a/cuda_bindings/docs/source/api.rst
+++ b/cuda_bindings/docs/source/api.rst
@@ -14,3 +14,4 @@ CUDA Python API Reference
    module/nvrtc
    module/nvjitlink
    module/nvvm
+   module/cufile
diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py
index 313970743..c156cb4cc 100644
--- a/cuda_bindings/docs/source/conf.py
+++ b/cuda_bindings/docs/source/conf.py
@@ -35,6 +35,7 @@
 # ones.
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
     "sphinx.ext.napoleon",
     "sphinx.ext.intersphinx",
     "myst_nb",
@@ -103,6 +104,7 @@
     "numpy": ("https://numpy.org/doc/stable/", None),
     "nvvm": ("https://docs.nvidia.com/cuda/libnvvm-api/", None),
     "nvjitlink": ("https://docs.nvidia.com/cuda/nvjitlink/", None),
+    "cufile": ("https://docs.nvidia.com/gpudirect-storage/api-reference-guide/", None),
 }
 
 suppress_warnings = [
diff --git a/cuda_bindings/docs/source/module/cufile.rst b/cuda_bindings/docs/source/module/cufile.rst
new file mode 100644
index 000000000..86d54f6c2
--- /dev/null
+++ b/cuda_bindings/docs/source/module/cufile.rst
@@ -0,0 +1,76 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+.. default-role:: cpp:any
+.. module:: cuda.bindings.cufile
+
+cufile
+======
+
+The ``cuda.bindings.cufile`` Python module wraps the
+`cuFile C APIs <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html>`_.
+Supported on Linux only.
+
+Currently using this module requires NumPy to be present. Any recent NumPy 1.x or 2.x should work.
+
+
+Functions
+---------
+
+.. autosummary::
+   :toctree: generated/
+
+   handle_register
+   handle_deregister
+   buf_register
+   buf_deregister
+   read
+   write
+   driver_open
+   use_count
+   driver_get_properties
+   driver_set_poll_mode
+   driver_set_max_direct_io_size
+   driver_set_max_cache_size
+   driver_set_max_pinned_mem_size
+   batch_io_set_up
+   batch_io_submit
+   batch_io_get_status
+   batch_io_cancel
+   batch_io_destroy
+   read_async
+   write_async
+   stream_register
+   stream_deregister
+   get_version
+   get_parameter_size_t
+   get_parameter_bool
+   get_parameter_string
+   set_parameter_size_t
+   set_parameter_bool
+   set_parameter_string
+   op_status_error
+   driver_close
+
+
+Types
+-----
+
+.. autosummary::
+   :toctree: generated/
+
+   IOEvents
+   Descr
+   IOParams
+   OpError
+   DriverStatusFlags
+   DriverControlFlags
+   FeatureFlags
+   FileHandleType
+   Opcode
+   Status
+   BatchMode
+   SizeTConfigParameter
+   BoolConfigParameter
+   StringConfigParameter
+   cuFileError
diff --git a/cuda_bindings/docs/source/release/12.X.Y-notes.rst b/cuda_bindings/docs/source/release/12.X.Y-notes.rst
index d2277b383..4ac1f4da6 100644
--- a/cuda_bindings/docs/source/release/12.X.Y-notes.rst
+++ b/cuda_bindings/docs/source/release/12.X.Y-notes.rst
@@ -10,6 +10,12 @@ Released on MM DD, 2025
 Highlights
 ----------
 
+* The ``cuda.bindings.cufile`` Python module was added, wrapping the
+  `cuFile C APIs <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html>`_.
+  Supported on Linux only.
+
+  * Currently using this module requires NumPy to be present. Any recent NumPy 1.x or 2.x should work.
+
 
 Bug fixes
 ---------
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index fe9debe0e..f1546e299 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -36,6 +36,7 @@ all = [
     "nvidia-cuda-nvcc-cu12",
     "nvidia-cuda-nvrtc-cu12",
     "nvidia-nvjitlink-cu12>=12.3",
+    "nvidia-cufile-cu12; sys_platform == 'linux'",
 ]
 
 test = [
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index f04ca6977..50f573a37 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -327,6 +327,10 @@ def do_cythonize(extensions):
 
 
 static_runtime_libraries = ["cudart_static", "rt"] if sys.platform == "linux" else ["cudart_static"]
+cuda_bindings_files = glob.glob("cuda/bindings/*.pyx")
+if sys.platform == "win32":
+    # cuFILE does not support Windows
+    cuda_bindings_files = [f for f in cuda_bindings_files if "cufile" not in f]
 sources_list = [
     # private
     (["cuda/bindings/_bindings/cydriver.pyx", "cuda/bindings/_bindings/loader.cpp"], None),
@@ -338,13 +342,12 @@ def do_cythonize(extensions):
     (["cuda/bindings/_lib/cyruntime/cyruntime.pyx"], None),
     (["cuda/bindings/_lib/cyruntime/utils.pyx"], None),
     # public
-    (["cuda/bindings/*.pyx"], None),
+    *(([f], None) for f in cuda_bindings_files),
     # public (deprecated, to be removed)
     (["cuda/*.pyx"], None),
     # internal files used by generated bindings
-    (["cuda/bindings/_internal/nvjitlink.pyx"], None),
-    (["cuda/bindings/_internal/nvvm.pyx"], None),
     (["cuda/bindings/_internal/utils.pyx"], None),
+    *(([f], None) for f in dst_files if f.endswith(".pyx")),
 ]
 
 for sources, libraries in sources_list:
diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py
new file mode 100644
index 000000000..4962a7fed
--- /dev/null
+++ b/cuda_bindings/tests/test_cufile.py
@@ -0,0 +1,1840 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import ctypes
+import errno
+import os
+import tempfile
+from contextlib import suppress
+
+import pytest
+
+import cuda.bindings.driver as cuda
+
+try:
+    from cuda.bindings import cufile
+except ImportError:
+    cufile = None
+
+
+if cufile is None:
+    pytest.skip("skipping tests on Windows", allow_module_level=True)
+
+
+def cufileLibraryAvailable():
+    """Check if cuFile library is available on the system."""
+    try:
+        # Try to get cuFile library version - this will fail if library is not available
+        version = cufile.get_version()
+        print(f"cuFile library available, version: {version}")
+        return True
+    except Exception as e:
+        print(f"cuFile library not available: {e}")
+        return False
+
+
+def cufileVersionLessThan(target):
+    """Check if cuFile library version is less than target version."""
+    try:
+        # Get cuFile library version
+        version = cufile.get_version()
+        print(f"cuFile library version: {version}")
+        # Check if version is less than target
+        if version < target:
+            print(f"cuFile library version {version} is less than required {target}")
+            return True
+        return False
+    except Exception as e:
+        print(f"Error checking cuFile version: {e}")
+        return True  # Assume old version if any error occurs
+
+
+def isSupportedFilesystem():
+    """Check if the current filesystem is supported (ext4 or xfs)."""
+    try:
+        # Try to get filesystem type from /proc/mounts
+        with open("/proc/mounts") as f:
+            for line in f:
+                parts = line.split()
+                if len(parts) >= 2:
+                    mount_point = parts[1]
+                    fs_type = parts[2]
+
+                    # Check if current directory is under this mount point
+                    current_dir = os.path.abspath(".")
+                    if current_dir.startswith(mount_point):
+                        fs_type_lower = fs_type.lower()
+                        print(f"Current filesystem type: {fs_type_lower}")
+                        return fs_type_lower in ["ext4", "xfs"]
+
+        # If we get here, we couldn't determine the filesystem type
+        print("Could not determine filesystem type from /proc/mounts")
+        return False
+    except Exception as e:
+        print(f"Error checking filesystem type: {e}")
+        return False
+
+
+# Global skip condition for all tests if cuFile library is not available
+pytestmark = pytest.mark.skipif(not cufileLibraryAvailable(), reason="cuFile library not available on this system")
+
+
+def safe_decode_string(raw_value):
+    """Safely decode a string value from ctypes buffer."""
+    # Find null terminator if present
+    null_pos = raw_value.find(b"\x00")
+    if null_pos != -1:
+        raw_value = raw_value[:null_pos]
+    # Decode with error handling
+    try:
+        return raw_value.decode("utf-8", errors="ignore")
+    except UnicodeDecodeError:
+        # If UTF-8 fails, try to decode as bytes
+        return str(raw_value)
+
+
+def test_cufile_success_defined():
+    """Check if CUFILE_SUCCESS is defined in OpError enum."""
+    assert hasattr(cufile.OpError, "SUCCESS")
+
+
+def test_driver_open():
+    """Test cuFile driver initialization."""
+    cufile.driver_open()
+    cufile.driver_close()
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_handle_register():
+    """Test file handle registration with cuFile."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_handle_register.bin"
+
+    # Create file with POSIX operations
+    fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o644)
+
+    # Write test data using POSIX write
+    test_data = b"Test data for cuFile - POSIX write"
+    bytes_written = os.write(fd, test_data)
+
+    # Sync to ensure data is on disk
+    os.fsync(fd)
+
+    # Close and reopen with O_DIRECT for cuFile operations
+    os.close(fd)
+
+    # Reopen with O_DIRECT
+    flags = os.O_RDWR | os.O_DIRECT
+    fd = os.open(file_path, flags)
+
+    try:
+        # Create and initialize the descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register the handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Deregister the handle
+        cufile.handle_deregister(handle)
+
+    finally:
+        os.close(fd)
+        with suppress(OSError):
+            os.unlink(file_path)
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_simple():
+    """Simple test for buffer registration with cuFile."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate CUDA memory
+    buffer_size = 4096  # 4KB, aligned to 4096 bytes
+    err, buf_ptr = cuda.cuMemAlloc(buffer_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Register the buffer with cuFile
+        flags = 0
+        buf_ptr_int = int(buf_ptr)
+        cufile.buf_register(buf_ptr_int, buffer_size, flags)
+
+        # Deregister the buffer
+        cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free CUDA memory
+        cuda.cuMemFree(buf_ptr)
+
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_host_memory():
+    """Test buffer registration with host memory."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate host memory
+    buffer_size = 4096  # 4KB, aligned to 4096 bytes
+    err, buf_ptr = cuda.cuMemHostAlloc(buffer_size, 0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Register the host buffer with cuFile
+        flags = 0
+        buf_ptr_int = int(buf_ptr)
+        cufile.buf_register(buf_ptr_int, buffer_size, flags)
+
+        # Deregister the buffer
+        cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free host memory
+        cuda.cuMemFreeHost(buf_ptr)
+
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_multiple_buffers():
+    """Test registering multiple buffers."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate multiple CUDA buffers
+    buffer_sizes = [4096, 16384, 65536]  # All aligned to 4096 bytes
+    buffers = []
+
+    for size in buffer_sizes:
+        err, buf_ptr = cuda.cuMemAlloc(size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        buffers.append(buf_ptr)
+
+    try:
+        # Register all buffers
+        flags = 0
+        for buf_ptr, size in zip(buffers, buffer_sizes):
+            buf_ptr_int = int(buf_ptr)
+            cufile.buf_register(buf_ptr_int, size, flags)
+
+        # Deregister all buffers
+        for buf_ptr in buffers:
+            buf_ptr_int = int(buf_ptr)
+            cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free all buffers
+        for buf_ptr in buffers:
+            cuda.cuMemFree(buf_ptr)
+
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_invalid_flags():
+    """Test buffer registration with invalid flags."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate CUDA memory
+    buffer_size = 65536
+    err, buf_ptr = cuda.cuMemAlloc(buffer_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Try to register with invalid flags
+        invalid_flags = 999
+        buf_ptr_int = int(buf_ptr)
+
+        with suppress(Exception):
+            cufile.buf_register(buf_ptr_int, buffer_size, invalid_flags)
+            # If we get here, deregister to clean up
+            cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free CUDA memory
+        cuda.cuMemFree(buf_ptr)
+
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_large_buffer():
+    """Test buffer registration with a large buffer."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate large CUDA memory (1MB, aligned to 4096 bytes)
+    buffer_size = 1024 * 1024  # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0)
+    err, buf_ptr = cuda.cuMemAlloc(buffer_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Register the large buffer with cuFile
+        flags = 0
+        buf_ptr_int = int(buf_ptr)
+        cufile.buf_register(buf_ptr_int, buffer_size, flags)
+
+        # Deregister the buffer
+        cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free CUDA memory
+        cuda.cuMemFree(buf_ptr)
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+def test_buf_register_already_registered():
+    """Test that registering an already registered buffer fails."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Allocate CUDA memory
+    buffer_size = 4096  # 4KB, aligned to 4096 bytes
+    err, buf_ptr = cuda.cuMemAlloc(buffer_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Register the buffer first time
+        flags = 0
+        buf_ptr_int = int(buf_ptr)
+        cufile.buf_register(buf_ptr_int, buffer_size, flags)
+
+        # Try to register the same buffer again
+        try:
+            cufile.buf_register(buf_ptr_int, buffer_size, flags)
+            # If we get here, deregister both times
+            cufile.buf_deregister(buf_ptr_int)
+            cufile.buf_deregister(buf_ptr_int)
+        except Exception:
+            # Expected error when registering buffer twice
+            # Deregister the first registration
+            cufile.buf_deregister(buf_ptr_int)
+
+    finally:
+        # Free CUDA memory
+        cuda.cuMemFree(buf_ptr)
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_read_write():
+    """Test cuFile read and write operations."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_rw.bin"
+
+    # Allocate CUDA memory for write and read
+    write_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
+    err, write_buf = cuda.cuMemAlloc(write_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, read_buf = cuda.cuMemAlloc(write_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Allocate host memory for data verification
+    host_buf = ctypes.create_string_buffer(write_size)
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register buffers with cuFile
+        write_buf_int = int(write_buf)
+        read_buf_int = int(read_buf)
+
+        cufile.buf_register(write_buf_int, write_size, 0)
+        cufile.buf_register(read_buf_int, write_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Prepare test data
+        test_string = b"Hello cuFile! This is test data for read/write operations. "
+        test_string_len = len(test_string)
+        repetitions = write_size // test_string_len
+        test_data = test_string * repetitions
+        test_data = test_data[:write_size]  # Ensure it fits exactly in buffer
+        host_buf = ctypes.create_string_buffer(test_data, write_size)
+
+        # Copy test data to CUDA write buffer
+        cuda.cuMemcpyHtoDAsync(write_buf, host_buf, write_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Write data using cuFile
+        bytes_written = cufile.write(handle, write_buf_int, write_size, 0, 0)
+
+        # Read data back using cuFile
+        bytes_read = cufile.read(handle, read_buf_int, write_size, 0, 0)
+
+        # Copy read data back to host
+        cuda.cuMemcpyDtoHAsync(host_buf, read_buf, write_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Verify the data
+        read_data = host_buf.value
+        assert read_data == test_data, "Read data doesn't match written data"
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        cufile.buf_deregister(write_buf_int)
+        cufile.buf_deregister(read_buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free CUDA memory
+        cuda.cuMemFree(write_buf)
+        cuda.cuMemFree(read_buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_read_write_host_memory():
+    """Test cuFile read and write operations using host memory."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_rw_host.bin"
+
+    # Allocate host memory for write and read
+    write_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
+    err, write_buf = cuda.cuMemHostAlloc(write_size, 0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, read_buf = cuda.cuMemHostAlloc(write_size, 0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register host buffers with cuFile
+        write_buf_int = int(write_buf)
+        read_buf_int = int(read_buf)
+
+        cufile.buf_register(write_buf_int, write_size, 0)
+        cufile.buf_register(read_buf_int, write_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Prepare test data
+        test_string = b"Host memory test data for cuFile operations! "
+        test_string_len = len(test_string)
+        repetitions = write_size // test_string_len
+        test_data = test_string * repetitions
+        test_data = test_data[:write_size]  # Ensure it fits exactly in buffer
+
+        # Copy test data to host write buffer
+        host_buf = ctypes.create_string_buffer(test_data, write_size)
+        write_buf_content = ctypes.string_at(write_buf, write_size)
+
+        # Write data using cuFile
+        bytes_written = cufile.write(handle, write_buf_int, write_size, 0, 0)
+
+        # Sync to ensure data is on disk
+        os.fsync(fd)
+
+        # Read data back using cuFile
+        bytes_read = cufile.read(handle, read_buf_int, write_size, 0, 0)
+
+        # Verify the data
+        read_data = ctypes.string_at(read_buf, write_size)
+        expected_data = write_buf_content
+        assert read_data == expected_data, "Read data doesn't match written data"
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        cufile.buf_deregister(write_buf_int)
+        cufile.buf_deregister(read_buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free host memory
+        cuda.cuMemFreeHost(write_buf)
+        cuda.cuMemFreeHost(read_buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_read_write_large():
+    """Test cuFile read and write operations with large data."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_rw_large.bin"
+
+    # Allocate large CUDA memory (1MB, aligned to 4096 bytes)
+    write_size = 1024 * 1024  # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0)
+    err, write_buf = cuda.cuMemAlloc(write_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, read_buf = cuda.cuMemAlloc(write_size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Allocate host memory for data verification
+    host_buf = ctypes.create_string_buffer(write_size)
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register buffers with cuFile
+        write_buf_int = int(write_buf)
+        read_buf_int = int(read_buf)
+
+        cufile.buf_register(write_buf_int, write_size, 0)
+        cufile.buf_register(read_buf_int, write_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Generate large test data
+        import random
+
+        test_data = bytes(random.getrandbits(8) for _ in range(write_size))
+        host_buf = ctypes.create_string_buffer(test_data, write_size)
+
+        # Copy test data to CUDA write buffer
+        cuda.cuMemcpyHtoDAsync(write_buf, host_buf, write_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Get the actual data that was written to CUDA buffer
+        cuda.cuMemcpyDtoHAsync(host_buf, write_buf, write_size, 0)
+        cuda.cuStreamSynchronize(0)
+        expected_data = host_buf.value
+
+        # Write data using cuFile
+        bytes_written = cufile.write(handle, write_buf_int, write_size, 0, 0)
+
+        # Read data back using cuFile
+        bytes_read = cufile.read(handle, read_buf_int, write_size, 0, 0)
+
+        # Copy read data back to host
+        cuda.cuMemcpyDtoHAsync(host_buf, read_buf, write_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Verify the data
+        read_data = host_buf.value
+        assert read_data == expected_data, "Large read data doesn't match written data"
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        cufile.buf_deregister(write_buf_int)
+        cufile.buf_deregister(read_buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free CUDA memory
+        cuda.cuMemFree(write_buf)
+        cuda.cuMemFree(read_buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_write_async():
+    """Test cuFile asynchronous write operations."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_write_async.bin"
+    fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+    try:
+        # Register file handle
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+        handle = cufile.handle_register(descr.ptr)
+
+        # Allocate and register device buffer
+        buf_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
+        err, buf_ptr = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        cufile.buf_register(int(buf_ptr), buf_size, 0)
+
+        # Create CUDA stream
+        err, stream = cuda.cuStreamCreate(0)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+        # Register stream with cuFile
+        cufile.stream_register(int(stream), 0)
+
+        # Prepare test data in device buffer
+        test_string = b"Async write test data for cuFile!"
+        test_string_len = len(test_string)
+        repetitions = buf_size // test_string_len
+        test_data = test_string * repetitions
+        test_data = test_data[:buf_size]  # Ensure it fits exactly in buffer
+        host_buf = ctypes.create_string_buffer(test_data, buf_size)
+        cuda.cuMemcpyHtoDAsync(buf_ptr, host_buf, buf_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Create parameter arrays for async write
+        size_p = ctypes.c_size_t(buf_size)
+        file_offset_p = ctypes.c_int64(0)
+        buf_ptr_offset_p = ctypes.c_int64(0)
+        bytes_written_p = ctypes.c_ssize_t(0)
+
+        # Perform async write
+        cufile.write_async(
+            int(handle),
+            int(buf_ptr),
+            ctypes.addressof(size_p),
+            ctypes.addressof(file_offset_p),
+            ctypes.addressof(buf_ptr_offset_p),
+            ctypes.addressof(bytes_written_p),
+            int(stream),
+        )
+
+        # Synchronize stream to wait for completion
+        cuda.cuStreamSynchronize(stream)
+
+        # Verify bytes written
+        assert bytes_written_p.value == buf_size, f"Expected {buf_size} bytes written, got {bytes_written_p.value}"
+
+        # Deregister stream
+        cufile.stream_deregister(int(stream))
+
+        # Deregister and cleanup
+        cufile.buf_deregister(int(buf_ptr))
+        cufile.handle_deregister(handle)
+        cuda.cuStreamDestroy(stream)
+        cuda.cuMemFree(buf_ptr)
+
+    finally:
+        os.close(fd)
+        with suppress(OSError):
+            os.unlink(file_path)
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_read_async():
+    """Test cuFile asynchronous read operations."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_read_async.bin"
+
+    # First create and write test data without O_DIRECT
+    fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o644)
+    # Create test data that's aligned to 4096 bytes
+    test_string = b"Async read test data for cuFile!"
+    test_string_len = len(test_string)
+    buf_size = 65536  # 64KB, aligned to 4096 bytes
+    repetitions = buf_size // test_string_len
+    test_data = test_string * repetitions
+    test_data = test_data[:buf_size]  # Ensure exact 64KB
+    os.write(fd_temp, test_data)
+    os.fsync(fd_temp)
+    os.close(fd_temp)
+
+    # Now open with O_DIRECT for cuFile operations
+    fd = os.open(file_path, os.O_RDWR | os.O_DIRECT)
+
+    try:
+        # Register file handle
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+        handle = cufile.handle_register(descr.ptr)
+
+        # Allocate and register device buffer
+        buf_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
+        err, buf_ptr = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        cufile.buf_register(int(buf_ptr), buf_size, 0)
+
+        # Create CUDA stream
+        err, stream = cuda.cuStreamCreate(0)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+        # Register stream with cuFile
+        cufile.stream_register(int(stream), 0)
+
+        # Create parameter arrays for async read
+        size_p = ctypes.c_size_t(buf_size)
+        file_offset_p = ctypes.c_int64(0)
+        buf_ptr_offset_p = ctypes.c_int64(0)
+        bytes_read_p = ctypes.c_ssize_t(0)
+
+        # Perform async read
+        cufile.read_async(
+            int(handle),
+            int(buf_ptr),
+            ctypes.addressof(size_p),
+            ctypes.addressof(file_offset_p),
+            ctypes.addressof(buf_ptr_offset_p),
+            ctypes.addressof(bytes_read_p),
+            int(stream),
+        )
+
+        # Synchronize stream to wait for completion
+        cuda.cuStreamSynchronize(stream)
+
+        # Verify bytes read
+        assert bytes_read_p.value > 0, f"Expected bytes read, got {bytes_read_p.value}"
+
+        # Copy read data back to host and verify
+        host_buf = ctypes.create_string_buffer(buf_size)
+        cuda.cuMemcpyDtoHAsync(host_buf, buf_ptr, buf_size, 0)
+        cuda.cuStreamSynchronize(0)
+        read_data = host_buf.value[: bytes_read_p.value]
+        expected_data = test_data[: bytes_read_p.value]
+        assert read_data == expected_data, "Read data doesn't match written data"
+
+        # Deregister stream
+        cufile.stream_deregister(int(stream))
+
+        # Deregister and cleanup
+        cufile.buf_deregister(int(buf_ptr))
+        cufile.handle_deregister(handle)
+        cuda.cuStreamDestroy(stream)
+        cuda.cuMemFree(buf_ptr)
+
+    finally:
+        os.close(fd)
+        with suppress(OSError):
+            os.unlink(file_path)
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_cufile_async_read_write():
+    """Test cuFile asynchronous read and write operations in sequence."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_cufile_async_rw.bin"
+    fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+    try:
+        # Register file handle
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+        handle = cufile.handle_register(descr.ptr)
+
+        # Allocate and register device buffers
+        buf_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
+        err, write_buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        cufile.buf_register(int(write_buf), buf_size, 0)
+
+        err, read_buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        cufile.buf_register(int(read_buf), buf_size, 0)
+
+        # Create CUDA stream
+        err, stream = cuda.cuStreamCreate(0)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+        # Register stream with cuFile
+        cufile.stream_register(int(stream), 0)
+
+        # Prepare test data in write buffer
+        test_string = b"Async RW test data for cuFile!"
+        test_string_len = len(test_string)
+        repetitions = buf_size // test_string_len
+        test_data = test_string * repetitions
+        test_data = test_data[:buf_size]  # Ensure it fits exactly in buffer
+        host_buf = ctypes.create_string_buffer(test_data, buf_size)
+        cuda.cuMemcpyHtoDAsync(write_buf, host_buf, buf_size, 0)
+        cuda.cuStreamSynchronize(0)
+
+        # Create parameter arrays for async write
+        write_size_p = ctypes.c_size_t(buf_size)
+        write_file_offset_p = ctypes.c_int64(0)
+        write_buf_ptr_offset_p = ctypes.c_int64(0)
+        bytes_written_p = ctypes.c_ssize_t(0)
+
+        # Perform async write
+        cufile.write_async(
+            int(handle),
+            int(write_buf),
+            ctypes.addressof(write_size_p),
+            ctypes.addressof(write_file_offset_p),
+            ctypes.addressof(write_buf_ptr_offset_p),
+            ctypes.addressof(bytes_written_p),
+            int(stream),
+        )
+
+        # Synchronize stream to wait for write completion
+        cuda.cuStreamSynchronize(stream)
+
+        # Verify bytes written
+        assert bytes_written_p.value == buf_size, f"Expected {buf_size} bytes written, got {bytes_written_p.value}"
+
+        # Create parameter arrays for async read
+        read_size_p = ctypes.c_size_t(buf_size)
+        read_file_offset_p = ctypes.c_int64(0)
+        read_buf_ptr_offset_p = ctypes.c_int64(0)
+        bytes_read_p = ctypes.c_ssize_t(0)
+
+        # Perform async read
+        cufile.read_async(
+            int(handle),
+            int(read_buf),
+            ctypes.addressof(read_size_p),
+            ctypes.addressof(read_file_offset_p),
+            ctypes.addressof(read_buf_ptr_offset_p),
+            ctypes.addressof(bytes_read_p),
+            int(stream),
+        )
+
+        # Synchronize stream to wait for read completion
+        cuda.cuStreamSynchronize(stream)
+
+        # Verify bytes read
+        assert bytes_read_p.value == buf_size, f"Expected {buf_size} bytes read, got {bytes_read_p.value}"
+
+        # Copy read data back to host and verify
+        host_buf = ctypes.create_string_buffer(buf_size)
+        cuda.cuMemcpyDtoHAsync(host_buf, read_buf, buf_size, 0)
+        cuda.cuStreamSynchronize(0)
+        read_data = host_buf.value
+        assert read_data == test_data, "Read data doesn't match written data"
+
+        # Deregister stream
+        cufile.stream_deregister(int(stream))
+
+        # Deregister and cleanup
+        cufile.buf_deregister(int(write_buf))
+        cufile.buf_deregister(int(read_buf))
+        cufile.handle_deregister(handle)
+        cuda.cuStreamDestroy(stream)
+        cuda.cuMemFree(write_buf)
+        cuda.cuMemFree(read_buf)
+
+    finally:
+        os.close(fd)
+        with suppress(OSError):
+            os.unlink(file_path)
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_batch_io_basic():
+    """Test basic batch IO operations with multiple read/write operations."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_batch_io.bin"
+
+    # Allocate CUDA memory for multiple operations
+    buf_size = 65536  # 64KB
+    num_operations = 4
+
+    buffers = []
+    read_buffers = []  # Initialize read_buffers to avoid UnboundLocalError
+
+    for i in range(num_operations):
+        err, buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        buffers.append(buf)
+
+    # Allocate host memory for data verification
+    host_buf = ctypes.create_string_buffer(buf_size)
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register buffers with cuFile
+        for buf in buffers:
+            buf_int = int(buf)
+            cufile.buf_register(buf_int, buf_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Set up batch IO
+        batch_handle = cufile.batch_io_set_up(num_operations)
+
+        # Create IOParams array for batch operations
+        io_params = cufile.IOParams(num_operations)
+        io_events = cufile.IOEvents(num_operations)
+
+        # Prepare test data for each operation
+        test_strings = [
+            b"Batch operation 1 data for testing cuFile! ",
+            b"Batch operation 2 data for testing cuFile! ",
+            b"Batch operation 3 data for testing cuFile! ",
+            b"Batch operation 4 data for testing cuFile! ",
+        ]
+
+        # Set up write operations
+        for i in range(num_operations):
+            # Prepare test data
+            test_string = test_strings[i]
+            test_string_len = len(test_string)
+            repetitions = buf_size // test_string_len
+            test_data = test_string * repetitions
+            test_data = test_data[:buf_size]  # Ensure it fits exactly in buffer
+            host_buf = ctypes.create_string_buffer(test_data, buf_size)
+
+            # Copy test data to CUDA buffer
+            cuda.cuMemcpyHtoDAsync(buffers[i], host_buf, buf_size, 0)
+            cuda.cuStreamSynchronize(0)
+
+            # Set up IOParams for this operation
+            io_params[i].mode = cufile.BatchMode.BATCH  # Batch mode
+            io_params[i].fh = handle
+            io_params[i].opcode = cufile.Opcode.WRITE  # Write opcode
+            io_params[i].cookie = i  # Use index as cookie for identification
+            io_params[i].u.batch.dev_ptr_base = int(buffers[i])
+            io_params[i].u.batch.file_offset = i * buf_size  # Sequential file offsets
+            io_params[i].u.batch.dev_ptr_offset = 0
+            io_params[i].u.batch.size_ = buf_size
+
+        # Submit batch write operations
+        cufile.batch_io_submit(batch_handle, num_operations, io_params.ptr, 0)
+
+        # Get batch status
+        min_nr = num_operations  # Wait for all operations to complete
+        nr_completed = ctypes.c_uint(num_operations)  # Initialize to max operations posted
+        timeout = ctypes.c_int(5000)  # 5 second timeout
+
+        cufile.batch_io_get_status(
+            batch_handle, min_nr, ctypes.addressof(nr_completed), io_events.ptr, ctypes.addressof(timeout)
+        )
+
+        # Verify all operations completed successfully
+        assert nr_completed.value == num_operations, f"Expected {num_operations} operations, got {nr_completed.value}"
+
+        # Collect all returned cookies
+        returned_cookies = set()
+        for i in range(num_operations):
+            assert io_events[i].status == cufile.Status.COMPLETE, (
+                f"Operation {i} failed with status {io_events[i].status}"
+            )
+            assert io_events[i].ret == buf_size, f"Expected {buf_size} bytes, got {io_events[i].ret} for operation {i}"
+            returned_cookies.add(io_events[i].cookie)
+
+        # Verify all expected cookies are present
+        expected_cookies = set(range(num_operations))  # cookies 0, 1, 2, 3
+        assert returned_cookies == expected_cookies, (
+            f"Cookie mismatch. Expected {expected_cookies}, got {returned_cookies}"
+        )
+
+        # Now test batch read operations
+        read_buffers = []
+        for i in range(num_operations):
+            err, buf = cuda.cuMemAlloc(buf_size)
+            assert err == cuda.CUresult.CUDA_SUCCESS
+            read_buffers.append(buf)
+            buf_int = int(buf)
+            cufile.buf_register(buf_int, buf_size, 0)
+
+        # Create fresh io_events array for read operations
+        io_events_read = cufile.IOEvents(num_operations)
+
+        # Set up read operations
+        for i in range(num_operations):
+            io_params[i].mode = cufile.BatchMode.BATCH  # Batch mode
+            io_params[i].fh = handle
+            io_params[i].opcode = cufile.Opcode.READ  # Read opcode
+            io_params[i].cookie = i + 100  # Different cookie for reads
+            io_params[i].u.batch.dev_ptr_base = int(read_buffers[i])
+            io_params[i].u.batch.file_offset = i * buf_size
+            io_params[i].u.batch.dev_ptr_offset = 0
+            io_params[i].u.batch.size_ = buf_size
+
+        # Submit batch read operations
+        cufile.batch_io_submit(batch_handle, num_operations, io_params.ptr, 0)
+
+        # Get batch status for reads
+        cufile.batch_io_get_status(
+            batch_handle, min_nr, ctypes.addressof(nr_completed), io_events_read.ptr, ctypes.addressof(timeout)
+        )
+
+        # Verify read operations completed successfully
+        assert nr_completed.value == num_operations, (
+            f"Expected {num_operations} read operations, got {nr_completed.value}"
+        )
+
+        # Collect all returned cookies for read operations
+        returned_cookies_read = set()
+        for i in range(num_operations):
+            assert io_events_read[i].status == cufile.Status.COMPLETE, (
+                f"Operation {i} failed with status {io_events_read[i].status}"
+            )
+            assert io_events_read[i].ret == buf_size, (
+                f"Expected {buf_size} bytes read, got {io_events_read[i].ret} for operation {i}"
+            )
+            returned_cookies_read.add(io_events_read[i].cookie)
+
+        # Verify all expected cookies are present
+        expected_cookies_read = set(range(100, 100 + num_operations))  # cookies 100, 101, 102, 103
+        assert returned_cookies_read == expected_cookies_read, (
+            f"Cookie mismatch. Expected {expected_cookies_read}, got {returned_cookies_read}"
+        )
+
+        # Verify the read data matches the written data
+        for i in range(num_operations):
+            # Copy read data back to host
+            cuda.cuMemcpyDtoHAsync(host_buf, read_buffers[i], buf_size, 0)
+            cuda.cuStreamSynchronize(0)
+            read_data = host_buf.value
+
+            # Prepare expected data
+            test_string = test_strings[i]
+            test_string_len = len(test_string)
+            repetitions = buf_size // test_string_len
+            expected_data = (test_string * repetitions)[:buf_size]
+
+            assert read_data == expected_data, f"Read data doesn't match written data for operation {i}"
+
+        # Clean up batch IO
+        cufile.batch_io_destroy(batch_handle)
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        for buf in buffers + read_buffers:
+            buf_int = int(buf)
+            cufile.buf_deregister(buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free CUDA memory
+        for buf in buffers + read_buffers:
+            cuda.cuMemFree(buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_batch_io_cancel():
+    """Test batch IO cancellation."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_batch_cancel.bin"
+
+    # Allocate CUDA memory
+    buf_size = 4096  # 4KB, aligned to 4096 bytes
+    num_operations = 2
+
+    buffers = []
+    for i in range(num_operations):
+        err, buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        buffers.append(buf)
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register buffers with cuFile
+        for buf in buffers:
+            buf_int = int(buf)
+            cufile.buf_register(buf_int, buf_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Set up batch IO
+        batch_handle = cufile.batch_io_set_up(num_operations)
+
+        # Create IOParams array for batch operations
+        io_params = cufile.IOParams(num_operations)
+
+        # Set up write operations
+        for i in range(num_operations):
+            io_params[i].mode = cufile.BatchMode.BATCH  # Batch mode
+            io_params[i].fh = handle
+            io_params[i].opcode = cufile.Opcode.WRITE  # Write opcode
+            io_params[i].cookie = i
+            io_params[i].u.batch.dev_ptr_base = int(buffers[i])
+            io_params[i].u.batch.file_offset = i * buf_size
+            io_params[i].u.batch.dev_ptr_offset = 0
+            io_params[i].u.batch.size_ = buf_size
+
+        # Submit batch operations
+        cufile.batch_io_submit(batch_handle, num_operations, io_params.ptr, 0)
+
+        # Cancel the batch operations
+        cufile.batch_io_cancel(batch_handle)
+
+        # Clean up batch IO
+        cufile.batch_io_destroy(batch_handle)
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        for buf in buffers:
+            buf_int = int(buf)
+            cufile.buf_deregister(buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free CUDA memory
+        for buf in buffers:
+            cuda.cuMemFree(buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
+def test_batch_io_large_operations():
+    """Test batch IO with large buffer operations."""
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    # Open cuFile driver
+    cufile.driver_open()
+
+    # Create test file
+    file_path = "test_batch_large.bin"
+
+    # Allocate large CUDA memory (1MB, aligned to 4096 bytes)
+    buf_size = 1024 * 1024  # 1MB, aligned to 4096 bytes
+    num_operations = 2
+
+    write_buffers = []
+    read_buffers = []
+    all_buffers = []  # Initialize all_buffers to avoid UnboundLocalError
+
+    for i in range(num_operations):
+        err, buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        write_buffers.append(buf)
+
+        err, buf = cuda.cuMemAlloc(buf_size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        read_buffers.append(buf)
+
+    # Allocate host memory for data verification
+    host_buf = ctypes.create_string_buffer(buf_size)
+
+    try:
+        # Create file with O_DIRECT
+        fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o644)
+
+        # Register all buffers with cuFile
+        all_buffers = write_buffers + read_buffers
+        for buf in all_buffers:
+            buf_int = int(buf)
+            cufile.buf_register(buf_int, buf_size, 0)
+
+        # Create file descriptor
+        descr = cufile.Descr()
+        descr.type = cufile.FileHandleType.OPAQUE_FD
+        descr.handle.fd = fd
+        descr.fs_ops = 0
+
+        # Register file handle
+        handle = cufile.handle_register(descr.ptr)
+
+        # Set up batch IO
+        batch_handle = cufile.batch_io_set_up(num_operations * 2)  # 2 writes + 2 reads
+
+        # Create IOParams array for batch operations
+        io_params = cufile.IOParams(num_operations * 2)
+        io_events = cufile.IOEvents(num_operations * 2)
+
+        # Prepare test data
+        test_strings = [
+            b"Large batch operation 1 data for testing cuFile with 1MB buffers! ",
+            b"Large batch operation 2 data for testing cuFile with 1MB buffers! ",
+        ]
+
+        # Prepare write data
+        for i in range(num_operations):
+            test_string = test_strings[i]
+            test_string_len = len(test_string)
+            repetitions = buf_size // test_string_len
+            test_data = test_string * repetitions
+            test_data = test_data[:buf_size]
+            host_buf = ctypes.create_string_buffer(test_data, buf_size)
+            cuda.cuMemcpyHtoDAsync(write_buffers[i], host_buf, buf_size, 0)
+            cuda.cuStreamSynchronize(0)
+
+        # Set up write operations
+        for i in range(num_operations):
+            io_params[i].mode = cufile.BatchMode.BATCH  # Batch mode
+            io_params[i].fh = handle
+            io_params[i].opcode = cufile.Opcode.WRITE  # Write opcode
+            io_params[i].cookie = i
+            io_params[i].u.batch.dev_ptr_base = int(write_buffers[i])
+            io_params[i].u.batch.file_offset = i * buf_size
+            io_params[i].u.batch.dev_ptr_offset = 0
+            io_params[i].u.batch.size_ = buf_size
+
+        # Set up read operations
+        for i in range(num_operations):
+            idx = i + num_operations
+            io_params[idx].mode = cufile.BatchMode.BATCH  # Batch mode
+            io_params[idx].fh = handle
+            io_params[idx].opcode = cufile.Opcode.READ  # Read opcode
+            io_params[idx].cookie = i + 100
+            io_params[idx].u.batch.dev_ptr_base = int(read_buffers[i])
+            io_params[idx].u.batch.file_offset = i * buf_size
+            io_params[idx].u.batch.dev_ptr_offset = 0
+            io_params[idx].u.batch.size_ = buf_size
+
+        # Submit batch operations
+        cufile.batch_io_submit(batch_handle, num_operations * 2, io_params.ptr, 0)
+
+        # Get batch status
+        min_nr = num_operations * 2  # Wait for all operations to complete
+        nr_completed = ctypes.c_uint(num_operations * 2)  # Initialize to max operations posted
+        timeout = ctypes.c_int(10000)  # 10 second timeout for large operations
+
+        cufile.batch_io_get_status(
+            batch_handle, min_nr, ctypes.addressof(nr_completed), io_events.ptr, ctypes.addressof(timeout)
+        )
+
+        # Verify all operations completed successfully
+        assert nr_completed.value == num_operations * 2, (
+            f"Expected {num_operations * 2} operations, got {nr_completed.value}"
+        )
+
+        # Collect all returned cookies
+        returned_cookies = set()
+        for i in range(num_operations * 2):
+            assert io_events[i].status == cufile.Status.COMPLETE, (
+                f"Operation {i} failed with status {io_events[i].status}"
+            )
+            returned_cookies.add(io_events[i].cookie)
+
+        # Verify all expected cookies are present
+        expected_cookies = set(range(num_operations)) | set(
+            range(100, 100 + num_operations)
+        )  # write cookies 0,1 + read cookies 100,101
+        assert returned_cookies == expected_cookies, (
+            f"Cookie mismatch. Expected {expected_cookies}, got {returned_cookies}"
+        )
+
+        # Verify the read data matches the written data
+        for i in range(num_operations):
+            # Copy read data back to host
+            cuda.cuMemcpyDtoHAsync(host_buf, read_buffers[i], buf_size, 0)
+            cuda.cuStreamSynchronize(0)
+            read_data = host_buf.value
+
+            # Prepare expected data
+            test_string = test_strings[i]
+            test_string_len = len(test_string)
+            repetitions = buf_size // test_string_len
+            expected_data = (test_string * repetitions)[:buf_size]
+
+            assert read_data == expected_data, f"Read data doesn't match written data for operation {i}"
+
+        # Clean up batch IO
+        cufile.batch_io_destroy(batch_handle)
+
+        # Deregister file handle
+        cufile.handle_deregister(handle)
+
+        # Deregister buffers
+        for buf in all_buffers:
+            buf_int = int(buf)
+            cufile.buf_deregister(buf_int)
+
+    finally:
+        # Close file
+        os.close(fd)
+        # Free CUDA memory
+        for buf in all_buffers:
+            cuda.cuMemFree(buf)
+        # Clean up test file
+        try:
+            os.unlink(file_path)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+        # Close cuFile driver
+        cufile.driver_close()
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(
+    cufileVersionLessThan(1140), reason="cuFile parameter APIs require cuFile library version 1.14.0 or later"
+)
+def test_set_get_parameter_size_t():
+    """Test setting and getting size_t parameters with cuFile validation."""
+
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Test setting and getting various size_t parameters
+
+        # Test poll threshold size (in KB)
+        poll_threshold_kb = 64  # 64KB threshold
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.POLLTHRESHOLD_SIZE_KB, poll_threshold_kb)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.POLLTHRESHOLD_SIZE_KB)
+        assert retrieved_value == poll_threshold_kb, (
+            f"Poll threshold mismatch: set {poll_threshold_kb}, got {retrieved_value}"
+        )
+
+        # Test max direct IO size (in KB)
+        max_direct_io_kb = 1024  # 1MB max direct IO size
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_MAX_DIRECT_IO_SIZE_KB, max_direct_io_kb)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_MAX_DIRECT_IO_SIZE_KB)
+        assert retrieved_value == max_direct_io_kb, (
+            f"Max direct IO size mismatch: set {max_direct_io_kb}, got {retrieved_value}"
+        )
+
+        # Test max device cache size (in KB)
+        max_cache_kb = 512  # 512KB max cache size
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB, max_cache_kb)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB)
+        assert retrieved_value == max_cache_kb, f"Max cache size mismatch: set {max_cache_kb}, got {retrieved_value}"
+
+        # Test per buffer cache size (in KB)
+        per_buffer_cache_kb = 128  # 128KB per buffer cache
+        cufile.set_parameter_size_t(
+            cufile.SizeTConfigParameter.PROPERTIES_PER_BUFFER_CACHE_SIZE_KB, per_buffer_cache_kb
+        )
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_PER_BUFFER_CACHE_SIZE_KB)
+        assert retrieved_value == per_buffer_cache_kb, (
+            f"Per buffer cache size mismatch: set {per_buffer_cache_kb}, got {retrieved_value}"
+        )
+
+        # Test max device pinned memory size (in KB)
+        max_pinned_kb = 2048  # 2MB max pinned memory
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB, max_pinned_kb)
+        retrieved_value = cufile.get_parameter_size_t(
+            cufile.SizeTConfigParameter.PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB
+        )
+        assert retrieved_value == max_pinned_kb, (
+            f"Max pinned memory size mismatch: set {max_pinned_kb}, got {retrieved_value}"
+        )
+
+        # Test IO batch size
+        batch_size = 16  # 16 operations per batch
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_IO_BATCHSIZE, batch_size)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_IO_BATCHSIZE)
+        assert retrieved_value == batch_size, f"IO batch size mismatch: set {batch_size}, got {retrieved_value}"
+
+        # Test batch IO timeout (in milliseconds)
+        timeout_ms = 5000  # 5 second timeout
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_BATCH_IO_TIMEOUT_MS, timeout_ms)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.PROPERTIES_BATCH_IO_TIMEOUT_MS)
+        assert retrieved_value == timeout_ms, f"Batch IO timeout mismatch: set {timeout_ms}, got {retrieved_value}"
+
+        # Test execution parameters
+        max_io_queue_depth = 32  # Max 32 operations in queue
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MAX_IO_QUEUE_DEPTH, max_io_queue_depth)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MAX_IO_QUEUE_DEPTH)
+        assert retrieved_value == max_io_queue_depth, (
+            f"Max IO queue depth mismatch: set {max_io_queue_depth}, got {retrieved_value}"
+        )
+
+        max_io_threads = 8  # Max 8 IO threads
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MAX_IO_THREADS, max_io_threads)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MAX_IO_THREADS)
+        assert retrieved_value == max_io_threads, (
+            f"Max IO threads mismatch: set {max_io_threads}, got {retrieved_value}"
+        )
+
+        min_io_threshold_kb = 4  # 4KB minimum IO threshold
+        cufile.set_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MIN_IO_THRESHOLD_SIZE_KB, min_io_threshold_kb)
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MIN_IO_THRESHOLD_SIZE_KB)
+        assert retrieved_value == min_io_threshold_kb, (
+            f"Min IO threshold mismatch: set {min_io_threshold_kb}, got {retrieved_value}"
+        )
+
+        max_request_parallelism = 4  # Max 4 parallel requests
+        cufile.set_parameter_size_t(
+            cufile.SizeTConfigParameter.EXECUTION_MAX_REQUEST_PARALLELISM, max_request_parallelism
+        )
+        retrieved_value = cufile.get_parameter_size_t(cufile.SizeTConfigParameter.EXECUTION_MAX_REQUEST_PARALLELISM)
+        assert retrieved_value == max_request_parallelism, (
+            f"Max request parallelism mismatch: set {max_request_parallelism}, got {retrieved_value}"
+        )
+
+    finally:
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(
+    cufileVersionLessThan(1140), reason="cuFile parameter APIs require cuFile library version 1.14.0 or later"
+)
+def test_set_get_parameter_bool():
+    """Test setting and getting boolean parameters with cuFile validation."""
+
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Test setting and getting various boolean parameters
+
+        # Test poll mode
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_USE_POLL_MODE, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_USE_POLL_MODE)
+        assert retrieved_value is True, f"Poll mode mismatch: set True, got {retrieved_value}"
+
+        # Test compatibility mode
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_ALLOW_COMPAT_MODE, False)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_ALLOW_COMPAT_MODE)
+        assert retrieved_value is False, f"Compatibility mode mismatch: set False, got {retrieved_value}"
+
+        # Test force compatibility mode
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.FORCE_COMPAT_MODE, False)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.FORCE_COMPAT_MODE)
+        assert retrieved_value is False, f"Force compatibility mode mismatch: set False, got {retrieved_value}"
+
+        # Test aggressive API check
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.FS_MISC_API_CHECK_AGGRESSIVE, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.FS_MISC_API_CHECK_AGGRESSIVE)
+        assert retrieved_value is True, f"Aggressive API check mismatch: set True, got {retrieved_value}"
+
+        # Test parallel IO
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.EXECUTION_PARALLEL_IO, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.EXECUTION_PARALLEL_IO)
+        assert retrieved_value is True, f"Parallel IO mismatch: set True, got {retrieved_value}"
+
+        # Test NVTX profiling
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.PROFILE_NVTX, False)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.PROFILE_NVTX)
+        assert retrieved_value is False, f"NVTX profiling mismatch: set False, got {retrieved_value}"
+
+        # Test system memory allowance
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_ALLOW_SYSTEM_MEMORY, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.PROPERTIES_ALLOW_SYSTEM_MEMORY)
+        assert retrieved_value is True, f"System memory allowance mismatch: set True, got {retrieved_value}"
+
+        # Test PCI P2P DMA
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.USE_PCIP2PDMA, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.USE_PCIP2PDMA)
+        assert retrieved_value is True, f"PCI P2P DMA mismatch: set True, got {retrieved_value}"
+
+        # Test IO uring preference
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.PREFER_IO_URING, False)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.PREFER_IO_URING)
+        assert retrieved_value is False, f"IO uring preference mismatch: set False, got {retrieved_value}"
+
+        # Test force O_DIRECT mode
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.FORCE_ODIRECT_MODE, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.FORCE_ODIRECT_MODE)
+        assert retrieved_value is True, f"Force O_DIRECT mode mismatch: set True, got {retrieved_value}"
+
+        # Test topology detection skip
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.SKIP_TOPOLOGY_DETECTION, False)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.SKIP_TOPOLOGY_DETECTION)
+        assert retrieved_value is False, f"Topology detection skip mismatch: set False, got {retrieved_value}"
+
+        # Test stream memops bypass
+        cufile.set_parameter_bool(cufile.BoolConfigParameter.STREAM_MEMOPS_BYPASS, True)
+        retrieved_value = cufile.get_parameter_bool(cufile.BoolConfigParameter.STREAM_MEMOPS_BYPASS)
+        assert retrieved_value is True, f"Stream memops bypass mismatch: set True, got {retrieved_value}"
+
+    finally:
+        cuda.cuDevicePrimaryCtxRelease(device)
+
+
+@pytest.mark.skipif(
+    cufileVersionLessThan(1140), reason="cuFile parameter APIs require cuFile library version 1.14.0 or later"
+)
+def test_set_get_parameter_string():
+    """Test setting and getting string parameters with cuFile validation."""
+
+    # Initialize CUDA
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    err, ctx = cuda.cuDevicePrimaryCtxRetain(device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxSetCurrent(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    try:
+        # Test setting and getting various string parameters
+        # Note: String parameter tests may have issues with the current implementation
+
+        # Test logging level
+        logging_level = "INFO"
+        try:
+            # Convert Python string to null-terminated C string
+            logging_level_bytes = logging_level.encode("utf-8") + b"\x00"
+            logging_level_buffer = ctypes.create_string_buffer(logging_level_bytes)
+            cufile.set_parameter_string(
+                cufile.StringConfigParameter.LOGGING_LEVEL, int(ctypes.addressof(logging_level_buffer))
+            )
+            retrieved_value_raw = cufile.get_parameter_string(cufile.StringConfigParameter.LOGGING_LEVEL, 256)
+            # Use safe_decode_string to handle null terminators and padding
+            retrieved_value = safe_decode_string(retrieved_value_raw.encode("utf-8"))
+            print(f"Logging level test: set {logging_level}, got {retrieved_value}")
+            # The retrieved value should be a string, so we can compare directly
+            assert retrieved_value == logging_level, (
+                f"Logging level mismatch: set {logging_level}, got {retrieved_value}"
+            )
+        except Exception as e:
+            print(f"Logging level test failed: {e}")
+            # Re-raise the exception to make the test fail
+            raise
+
+        # Test environment log file path
+        logfile_path = tempfile.gettempdir() + "/cufile.log"
+        try:
+            # Convert Python string to null-terminated C string
+            logfile_path_bytes = logfile_path.encode("utf-8") + b"\x00"
+            logfile_buffer = ctypes.create_string_buffer(logfile_path_bytes)
+            cufile.set_parameter_string(
+                cufile.StringConfigParameter.ENV_LOGFILE_PATH, int(ctypes.addressof(logfile_buffer))
+            )
+            retrieved_value_raw = cufile.get_parameter_string(cufile.StringConfigParameter.ENV_LOGFILE_PATH, 256)
+            # Use safe_decode_string to handle null terminators and padding
+            retrieved_value = safe_decode_string(retrieved_value_raw.encode("utf-8"))
+            print(f"Log file path test: set {logfile_path}, got {retrieved_value}")
+            # The retrieved value should be a string, so we can compare directly
+            assert retrieved_value == logfile_path, f"Log file path mismatch: set {logfile_path}, got {retrieved_value}"
+        except Exception as e:
+            print(f"Log file path test failed: {e}")
+            # Re-raise the exception to make the test fail
+            raise
+
+        # Test log directory
+        log_dir = tempfile.gettempdir() + "/cufile_logs"
+        try:
+            # Convert Python string to null-terminated C string
+            log_dir_bytes = log_dir.encode("utf-8") + b"\x00"
+            log_dir_buffer = ctypes.create_string_buffer(log_dir_bytes)
+            cufile.set_parameter_string(cufile.StringConfigParameter.LOG_DIR, int(ctypes.addressof(log_dir_buffer)))
+            retrieved_value_raw = cufile.get_parameter_string(cufile.StringConfigParameter.LOG_DIR, 256)
+            # Use safe_decode_string to handle null terminators and padding
+            retrieved_value = safe_decode_string(retrieved_value_raw.encode("utf-8"))
+            print(f"Log directory test: set {log_dir}, got {retrieved_value}")
+            # The retrieved value should be a string, so we can compare directly
+            assert retrieved_value == log_dir, f"Log directory mismatch: set {log_dir}, got {retrieved_value}"
+        except Exception as e:
+            print(f"Log directory test failed: {e}")
+            # Re-raise the exception to make the test fail
+            raise
+
+    finally:
+        cuda.cuDevicePrimaryCtxRelease(device)