From 94a9337e827aae5119fdfedc0bbb2f687df80647 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 9 May 2023 16:30:24 +0100 Subject: [PATCH 01/42] [SYCL][PI][UR][HIP] Port device, context, and platform --- sycl/plugins/hip/CMakeLists.txt | 17 + sycl/plugins/hip/pi_hip.cpp | 1552 +---------------- sycl/plugins/hip/pi_hip.hpp | 111 +- sycl/plugins/unified_runtime/CMakeLists.txt | 37 + sycl/plugins/unified_runtime/pi2ur.hpp | 2 +- .../ur/adapters/hip/common.cpp | 84 + .../ur/adapters/hip/common.hpp | 99 ++ .../ur/adapters/hip/context.cpp | 185 ++ .../ur/adapters/hip/context.hpp | 122 ++ .../ur/adapters/hip/device.cpp | 918 ++++++++++ .../ur/adapters/hip/device.hpp | 43 + .../ur/adapters/hip/platform.cpp | 141 ++ .../ur/adapters/hip/platform.hpp | 18 + .../ur/adapters/hip/ur_interface_loader.cpp | 258 +++ sycl/plugins/unified_runtime/ur/ur.hpp | 4 + 15 files changed, 2002 insertions(+), 1589 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 5af49f82e2b6e..dca1167124811 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -87,12 +87,29 @@ set(HIP_HEADERS "${PI_HIP_INCLUDE_DIR};${PI_HIP_HSA_INCLUDE_DIR}") # Create pi_hip library add_sycl_plugin(hip SOURCES + # Some code is shared with the UR adapter + 
"../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" + "../unified_runtime/ur/adapters/hip/common.cpp" + "../unified_runtime/ur/adapters/hip/common.hpp" + "../unified_runtime/ur/adapters/hip/context.cpp" + "../unified_runtime/ur/adapters/hip/context.hpp" + "../unified_runtime/ur/adapters/hip/device.cpp" + "../unified_runtime/ur/adapters/hip/device.hpp" + "../unified_runtime/ur/adapters/hip/platform.cpp" + "../unified_runtime/ur/adapters/hip/platform.hpp" + "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_hip.hpp" "pi_hip.cpp" INCLUDE_DIRS ${sycl_plugin_dir} + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime + LIBRARIES + UnifiedRuntime-Headers HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/features.hpp ) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 5ad0279b217f6..fdf335c923c99 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -28,73 +28,6 @@ #include namespace { -// Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be -// indexed, but on NVidia it is an opaque type and needs to go through -// cuArrayGetDescriptor so implement a utility function to get the array -// properties -inline void getArrayDesc(hipArray *array, hipArray_Format &format, - size_t &channels) { -#if defined(__HIP_PLATFORM_AMD__) - format = array->Format; - channels = array->NumChannels; -#elif defined(__HIP_PLATFORM_NVIDIA__) - CUDA_ARRAY_DESCRIPTOR arrayDesc; - cuArrayGetDescriptor(&arrayDesc, (CUarray)array); - - format = arrayDesc.Format; - channels = arrayDesc.NumChannels; -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif -} - -// NVidia HIP headers guard hipArray3DCreate behind __CUDACC__, this does not -// seem to be required and we're not using nvcc to build the HIP PI plugin so -// add the 
translation function here -#if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__) -inline static hipError_t -hipArray3DCreate(hiparray *pHandle, - const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) { - return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray)); -} -#endif - -// hipArray gets turned into cudaArray when using the HIP NVIDIA platform, and -// some CUDA APIs use cudaArray* and others use CUarray, these two represent the -// same type, however when building cudaArray appears as an opaque type, so it -// needs to be explicitly casted to CUarray. In order for this to work for both -// AMD and NVidia we introduce an second hipArray type that will be CUarray for -// NVIDIA and hipArray* for AMD so that we can place the explicit casts when -// necessary for NVIDIA and they will be no-ops for AMD. -#if defined(__HIP_PLATFORM_NVIDIA__) -typedef CUarray hipCUarray; -#elif defined(__HIP_PLATFORM_AMD__) -typedef hipArray *hipCUarray; -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif - -// Add missing HIP to CUDA defines -#if defined(__HIP_PLATFORM_NVIDIA__) -#define hipMemoryType CUmemorytype -#define hipMemoryTypeHost CU_MEMORYTYPE_HOST -#define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE -#define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY -#define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED -#endif - -std::string getHipVersionString() { - int driver_version = 0; - if (hipDriverGetVersion(&driver_version) != hipSuccess) { - return ""; - } - // The version is returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "HIP " << driver_version / 1000 << "." 
- << driver_version % 1000 / 10; - return stream.str(); -} - pi_result map_error(hipError_t result) { switch (result) { case hipSuccess: @@ -235,49 +168,6 @@ pi_result check_error(hipError_t result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -/// RAII type to guarantee recovering original HIP context -/// Scoped context is used across all PI HIP plugin implementation -/// to activate the PI Context on the current thread, matching the -/// HIP driver semantics where the context used for the HIP Driver -/// API is the one active on the thread. -/// The implementation tries to avoid replacing the hipCtx_t if it cans -class ScopedContext { - pi_context placedContext_; - hipCtx_t original_; - bool needToRecover_; - -public: - ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} { - - if (!placedContext_) { - throw PI_ERROR_INVALID_CONTEXT; - } - - hipCtx_t desired = placedContext_->get(); - PI_CHECK_ERROR(hipCtxGetCurrent(&original_)); - if (original_ != desired) { - // Sets the desired context as the active one for the thread - PI_CHECK_ERROR(hipCtxSetCurrent(desired)); - if (original_ == nullptr) { - // No context is installed on the current thread - // This is the most common case. We can activate the context in the - // thread and leave it there until all the PI context referring to the - // same underlying HIP context are destroyed. This emulates - // the behaviour of the HIP runtime api, and avoids costly context - // switches. No action is required on this side of the if. 
- } else { - needToRecover_ = true; - } - } - } - - ~ScopedContext() { - if (needToRecover_) { - PI_CHECK_ERROR(hipCtxSetCurrent(original_)); - } - } -}; - /// \cond NODOXY template pi_result getInfoImpl(size_t param_value_size, void *param_value, @@ -334,11 +224,28 @@ pi_result getInfo(size_t param_value_size, void *param_value, param_value_size_ret, value); } -int getAttribute(pi_device device, hipDeviceAttribute_t attribute) { - int value; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess); - return value; +ScopedContext::ScopedContext(pi_context ctxt) + : placedContext_{ctxt}, needToRecover_{false} { + if (!placedContext_) { + throw PI_ERROR_INVALID_CONTEXT; + } + + hipCtx_t desired = placedContext_->get(); + PI_CHECK_ERROR(hipCtxGetCurrent(&original_)); + if (original_ != desired) { + // Sets the desired context as the active one for the thread + PI_CHECK_ERROR(hipCtxSetCurrent(desired)); + if (original_ == nullptr) { + // No context is installed on the current thread + // This is the most common case. We can activate the context in the + // thread and leave it there until all the PI context referring to the + // same underlying HIP context are destroyed. This emulates + // the behaviour of the HIP runtime api, and avoids costly context + // switches. No action is required on this side of the if. 
+ } else { + needToRecover_ = true; + } + } } /// \endcond @@ -567,14 +474,14 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, if (queue_ != nullptr) { hip_piQueueRetain(queue_); } - hip_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_event::~_pi_event() { if (queue_ != nullptr) { hip_piQueueRelease(queue_); } - hip_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_result _pi_event::start() { @@ -719,12 +626,12 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { } _pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt} { - hip_piContextRetain(context_); + : module_{nullptr}, binary_{}, + binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { + pi2ur::piContextRetain(context_); } -_pi_program::~_pi_program() { hip_piContextRelease(context_); } +_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } pi_result _pi_program::set_binary(const char *source, size_t length) { assert((binary_ == nullptr && binarySizeInBytes_ == 0) && @@ -789,11 +696,11 @@ template class ReleaseGuard { T Captive; static pi_result callRelease(pi_device Captive) { - return hip_piDeviceRelease(Captive); + return pi2ur::piDeviceRelease(Captive); } static pi_result callRelease(pi_context Captive) { - return hip_piContextRelease(Captive); + return pi2ur::piContextRelease(Captive); } static pi_result callRelease(pi_mem Captive) { @@ -858,240 +765,6 @@ template class ReleaseGuard { //-- PI API implementation extern "C" { -/// Obtains the HIP platform. -/// There is only one HIP platform, and contains all devices on the system. -/// Triggers the HIP Driver initialization (hipInit) the first time, so this -/// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
-/// -pi_result hip_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { - - try { - static std::once_flag initFlag; - static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; - - if (num_entries == 0 and platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (platforms == nullptr and num_platforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - pi_result err = PI_SUCCESS; - - std::call_once( - initFlag, - [](pi_result &err) { - if (hipInit(0) != hipSuccess) { - numPlatforms = 0; - return; - } - int numDevices = 0; - hipError_t hipErrorCode = hipGetDeviceCount(&numDevices); - if (hipErrorCode == hipErrorNoDevice) { - numPlatforms = 0; - return; - } - err = PI_CHECK_ERROR(hipErrorCode); - if (numDevices == 0) { - numPlatforms = 0; - return; - } - try { - numPlatforms = numDevices; - platformIds.resize(numDevices); - - for (int i = 0; i < numDevices; ++i) { - hipDevice_t device; - err = PI_CHECK_ERROR(hipDeviceGet(&device, i)); - platformIds[i].devices_.emplace_back( - new _pi_device{device, &platformIds[i]}); - } - } catch (const std::bad_alloc &) { - // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - err = PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - throw; - } - }, - err); - - if (num_platforms != nullptr) { - *num_platforms = numPlatforms; - } - - if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result hip_piPlatformGetInfo([[maybe_unused]] pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(platform != nullptr); - - switch (param_name) { - case PI_PLATFORM_INFO_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - "AMD HIP BACKEND"); - case PI_PLATFORM_INFO_VENDOR: - return getInfo(param_value_size, param_value, param_value_size_ret, - "AMD Corporation"); - case PI_PLATFORM_INFO_PROFILE: - return getInfo(param_value_size, param_value, param_value_size_ret, - "FULL PROFILE"); - case PI_PLATFORM_INFO_VERSION: { - auto version = getHipVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_PLATFORM_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_EXT_PLATFORM_INFO_BACKEND: { - return getInfo(param_value_size, param_value, - param_value_size_ret, - PI_EXT_PLATFORM_BACKEND_HIP); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Platform info request not implemented"); - return {}; -} - -/// \param devices List of devices available on the system -/// \param num_devices Number of elements in the list of devices -/// Requesting a non-GPU device triggers an error, all PI HIP devices -/// are GPUs. -/// -pi_result hip_piDevicesGet(pi_platform platform, pi_device_type device_type, - pi_uint32 num_entries, pi_device *devices, - pi_uint32 *num_devices) { - - pi_result err = PI_SUCCESS; - const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForGPU; - - size_t numDevices = returnDevices ? 
platform->devices_.size() : 0; - - try { - if (num_devices) { - *num_devices = numDevices; - } - - if (returnDevices && devices) { - for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) { - devices[i] = platform->devices_[i].get(); - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -/// \return PI_SUCCESS if the function is exehipted successfully -/// HIP devices are always root devices so retain always returns success. -pi_result hip_piDeviceRetain(pi_device device) { - (void)device; - return PI_SUCCESS; -} - -pi_result hip_piContextGetInfo(pi_context context, pi_context_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - switch (param_name) { - case PI_CONTEXT_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1); - case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_reference_count()); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - return getInfo(param_value_size, param_value, param_value_size_ret, - true); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM operations currently not supported. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. 
- setErrorMessage("These queries should have never come here.", - PI_ERROR_INVALID_ARG_VALUE); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_OUT_OF_RESOURCES; -} - -pi_result hip_piContextRetain(pi_context context) { - assert(context != nullptr); - assert(context->get_reference_count() > 0); - - context->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result hip_piextContextSetExtendedDeleter( - pi_context context, pi_context_extended_deleter function, void *user_data) { - context->set_extended_deleter(function, user_data); - return PI_SUCCESS; -} - -/// Not applicable to HIP, devices cannot be partitioned. -/// -pi_result hip_piDevicePartition(pi_device device, - const pi_device_partition_property *properties, - pi_uint32 num_devices, pi_device *out_devices, - pi_uint32 *out_num_devices) { - (void)device; - (void)properties; - (void)num_devices; - (void)out_devices; - (void)out_num_devices; - - return PI_ERROR_INVALID_OPERATION; -} - /// \return If available, the first binary that is PTX /// pi_result hip_piextDeviceSelectBinary(pi_device device, @@ -1150,1110 +823,6 @@ pi_result hip_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// \return PI_SUCCESS always since HIP devices are always root devices. 
-/// -pi_result hip_piDeviceRelease(pi_device device) { - (void)device; - return PI_SUCCESS; -} - -pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - - assert(device != nullptr); - - switch (param_name) { - case PI_DEVICE_INFO_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_TYPE_GPU); - } - case PI_DEVICE_INFO_VENDOR_ID: { -#if defined(__HIP_PLATFORM_AMD__) - pi_uint32 vendor_id = 4098u; -#elif defined(__HIP_PLATFORM_NVIDIA__) - pi_uint32 vendor_id = 4318u; -#else - pi_uint32 vendor_id = 0u; -#endif - - return getInfo(param_value_size, param_value, param_value_size_ret, - vendor_id); - } - case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&compute_units, - hipDeviceAttributeMultiprocessorCount, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(compute_units >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(compute_units)); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - max_work_item_dimensions); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - size_t return_sizes[max_work_item_dimensions]; - - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = 
size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - size_t return_sizes[max_work_item_dimensions]; - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_work_group_size, - hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); - - sycl::detail::pi::assertion(max_work_group_size >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_work_group_size)); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, 
param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_threads, - hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // 
Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int major = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, - device->get()) == hipSuccess); - bool ifp = (major >= 7); - return getInfo(param_value_size, param_value, param_value_size_ret, ifp); - } - case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - size_t sizes[1] = {static_cast(warpSize)}; - return getInfoArray(1, param_value_size, param_value, - param_value_size_ret, sizes); - } - case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(clock_freq >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(clock_freq) / 1000u); - } - case PI_DEVICE_INFO_ADDRESS_BITS: { - auto bits = pi_uint32{std::numeric_limits::digits}; - return getInfo(param_value_size, param_value, param_value_size_ret, bits); - } - case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { - // Max size of memory object allocation in bytes. - // The minimum value is max(min(1024 × 1024 × - // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), - // 32 × 1024 × 1024) for devices that are not of type - // CL_DEVICE_TYPE_HIPSTOM. 
- - size_t global = 0; - sycl::detail::pi::assertion(hipDeviceTotalMem(&global, device->get()) == - hipSuccess); - - auto quarter_global = static_cast(global / 4u); - - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{max_alloc}); - } - case PI_DEVICE_INFO_IMAGE_SUPPORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { - // This call doesn't match to HIP as it doesn't have images, but instead - // surfaces and textures. No clear call in the HIP API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { - // This call doesn't match to HIP as it doesn't have images, but instead - // surfaces and textures. No clear call in the HIP API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - - case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. 
- int tex_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. 
- int tex_depth = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_depth >= 0); - int surf_depth = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_depth >= 0); - - int min = std::min(tex_depth, surf_depth); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(0)); - } - case PI_DEVICE_INFO_MAX_SAMPLERS: { - // This call is kind of meaningless for HIP, as samplers don't exist. - // Closest thing is textures, which is 128. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // __global__ function parameters are passed to the device via constant - // memory and are limited to 4 KB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{4000u}); - } - case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&mem_base_addr_align, - hipDeviceAttributeTextureAlignment, - device->get()) == hipSuccess); - // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, - mem_base_addr_align); - } - case PI_DEVICE_INFO_HALF_FP_CONFIG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | - PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { - // The value is dohipmented for all existing GPUs in the HIP programming - // guidelines, section "H.3.2. Global Memory". - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(cache_size >= 0); - // The L2 cache is global to the GPU. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(cache_size)); - } - case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::pi::assertion(hipDeviceTotalMem(&bytes, device->get()) == - hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{bytes}); - } - case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - unsigned int constant_memory = 0; - - // hipDeviceGetAttribute takes a int*, however the size of the constant - // memory on AMD GPU may be larger than what can fit in the positive part - // of a signed integer, so use an unsigned integer and cast the pointer to - // int*. - sycl::detail::pi::assertion( - hipDeviceGetAttribute(reinterpret_cast(&constant_memory), - hipDeviceAttributeTotalConstantMemory, - device->get()) == hipSuccess); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(constant_memory)); - } - case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: { - // TODO: is there a way to retrieve this from HIP driver API? - // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, 9u); - } - case PI_DEVICE_INFO_LOCAL_MEM_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); - } - case PI_DEVICE_INFO_LOCAL_MEM_SIZE: { - // OpenCL's "local memory" maps most closely to HIP's "shared memory". - // HIP has its own definition of "local memory", which maps to OpenCL's - // "private memory". 
- int local_mem_size = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&local_mem_size, - hipDeviceAttributeMaxSharedMemoryPerBlock, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(local_mem_size >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(local_mem_size)); - } - case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled, - device->get()) == hipSuccess); - - sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated, - device->get()) == hipSuccess); - - sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { - // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1000u}); - } - case PI_DEVICE_INFO_ENDIAN_LITTLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_COMPILER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_LINKER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - 
PI_TRUE); - } - case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE | - PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_BUILT_IN_KERNELS: { - // An empty string is returned if no built-in kernels are supported by the - // device. - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_PLATFORM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_platform()); - } - case PI_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::pi::assertion(hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == hipSuccess); - - // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch - // name instead, this is also what AMD OpenCL devices return. 
- if (strlen(name) == 0) { - hipDeviceProp_t props; - sycl::detail::pi::assertion( - hipGetDeviceProperties(&props, device->get()) == hipSuccess); - - return getInfoArray(strlen(props.gcnArchName) + 1, param_value_size, - param_value, param_value_size_ret, props.gcnArchName); - } - return getInfoArray(strlen(name) + 1, param_value_size, param_value, - param_value_size_ret, name); - } - case PI_DEVICE_INFO_VENDOR: { - return getInfo(param_value_size, param_value, param_value_size_ret, - "AMD Corporation"); - } - case PI_DEVICE_INFO_DRIVER_VERSION: { - auto version = getHipVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_DEVICE_INFO_PROFILE: { - return getInfo(param_value_size, param_value, param_value_size_ret, "HIP"); - } - case PI_DEVICE_INFO_REFERENCE_COUNT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_reference_count()); - } - case PI_DEVICE_INFO_VERSION: { - std::stringstream s; - - hipDeviceProp_t props; - sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); -#if defined(__HIP_PLATFORM_NVIDIA__) - s << props.major << "." << props.minor; -#elif defined(__HIP_PLATFORM_AMD__) - s << props.gcnArchName; -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif - - return getInfo(param_value_size, param_value, param_value_size_ret, - s.str().c_str()); - } - case PI_DEVICE_INFO_OPENCL_C_VERSION: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_BACKEND_VERSION: { - // TODO: return some meaningful for backend_version below - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_EXTENSIONS: { - // TODO: Remove comment when HIP support native asserts. - // DEVICELIB_ASSERT extension is set so fallback assert - // postprocessing is NOP. 
HIP 4.3 docs indicate support for - // native asserts are in progress - std::string SupportedExtensions = ""; - SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT; - SupportedExtensions += " "; - - hipDeviceProp_t props; - sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); - if (props.arch.hasDoubles) { - SupportedExtensions += "cl_khr_fp64 "; - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - SupportedExtensions.c_str()); - } - case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { - // The minimum value for the FULL profile is 1 MB. - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1024u}); - } - case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_PARENT_DEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - nullptr); - } - case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_PROPERTIES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - - // Intel USM extensions - - case PI_DEVICE_INFO_USM_HOST_SUPPORT: { - // from cl_intel_unified_shared_memory: "The host memory access capabilities - // apply to any host allocation." 
- // - // query if/how the device can access page-locked host memory, possibly - // through PCIe, using the same pointer as the host - pi_bitfield value = {}; - // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ACCESS | - PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - //} - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The device memory access capabilities apply to any device allocation - // associated with this device." - // - // query how the device can access memory allocated on the device itself (?) - pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | - PI_USM_CONCURRENT_ATOMIC_ACCESS; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The single device shared memory access capabilities apply to any shared - // allocation associated with this device." 
- // - // query if/how the device can access managed memory associated to it - pi_bitfield value = {}; - if (getAttribute(device, hipDeviceAttributeManagedMemory)) { - // the device can allocate managed memory on this system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS; - } - if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { - // the device can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The cross-device shared memory access capabilities apply to any shared - // allocation associated with this device, or to any shared memory - // allocation on another device that also supports the same cross-device - // shared memory access capability." 
- // - // query if/how the device can access managed memory associated to other - // devices - pi_bitfield value = {}; - if (getAttribute(device, hipDeviceAttributeManagedMemory)) { - // the device can allocate managed memory on this system - value |= PI_USM_ACCESS; - } - if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { - // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - // attribute can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - } - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - if (value & PI_USM_ACCESS) - value |= PI_USM_ATOMIC_ACCESS; - if (value & PI_USM_CONCURRENT_ACCESS) - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The shared system memory access capabilities apply to any allocations - // made by a system allocator, such as malloc or new." - // - // query if/how the device can access pageable host memory allocated by the - // system allocator - pi_bitfield value = {}; - if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) { - // the link between the device and the host does not support native - // atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - - case PI_DEVICE_INFO_ATOMIC_64: { - // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS. 
- hipDeviceProp_t props; - sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - props.arch.hasGlobalInt64Atomics && - props.arch.hasSharedInt64Atomics); - } - - case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; - sycl::detail::pi::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) == - hipSuccess, - "failed hipMemGetInfo() API."); - return getInfo(param_value_size, param_value, param_value_size_ret, - FreeMemory); - } - - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(value >= 0); - // Convert kilohertz to megahertz when returning. - return getInfo(param_value_size, param_value, param_value_size_ret, - value / 1000); - } - - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_int32{1}); - } - - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - pi_memory_order_capabilities capabilities = PI_MEMORY_ORDER_RELAXED | - PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence/memory_scope_capabilities. 
- // Because scopes are hierarchical, wider scopes support all narrower - // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and - // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - pi_memory_scope_capabilities capabilities = PI_MEMORY_SCOPE_WORK_ITEM | - PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence_order_capabilities. - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - - case PI_DEVICE_INFO_DEVICE_ID: { - int value = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId, - device->get()) == hipSuccess); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - - case PI_DEVICE_INFO_UUID: { -#if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \ - HIP_VERSION_MAJOR > 5) - hipUUID uuid = {}; - // Supported since 5.2+ - sycl::detail::pi::assertion(hipDeviceGetUuid(&uuid, device->get()) == - hipSuccess); - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return getInfoArray(16, param_value_size, param_value, param_value_size_ret, - name.data()); -#endif - return PI_ERROR_INVALID_VALUE; - } - case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: { - // The mem-channel buffer property is not supported on HIP devices. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - case PI_DEVICE_INFO_IMAGE_SRGB: { - // The sRGB images are not supported on HIP device. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - - case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { - // Maximum number of 32-bit registers available to a thread block. - // Note: This number is shared by all thread blocks simultaneously resident - // on a multiprocessor. - int max_registers{-1}; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_registers, - hipDeviceAttributeMaxRegistersPerBlock, - device->get()) == hipSuccess); - - sycl::detail::pi::assertion(max_registers >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(max_registers)); - } - - case PI_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; - sycl::detail::pi::assertion( - hipDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == - hipSuccess); - // A typical PCI address is 12 bytes + \0: "1234:67:90.2", but the HIP API is not - // guaranteed to use this format. In practice, it uses this format, at least - // in 5.3-5.5. To be on the safe side, we make sure the terminating \0 is set. - AddressBuffer[AddressBufferSize - 1] = '\0'; - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) > 0); - return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, - param_value_size, param_value, param_value_size_ret, - AddressBuffer); - } - // TODO: Investigate if this information is available on HIP. 
- case PI_DEVICE_INFO_GPU_EU_COUNT: - case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - case PI_DEVICE_INFO_GPU_SLICES: - case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: - case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: - case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: - setErrorMessage("HIP backend does not support this query", - PI_ERROR_INVALID_ARG_VALUE); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Device info request not implemented"); - return {}; -} - -/// Gets the native HIP handle of a PI device object -/// -/// \param[in] device The PI device to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI device object. -/// -/// \return PI_SUCCESS -pi_result hip_piextDeviceGetNativeHandle(pi_device device, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(device->get()); - return PI_SUCCESS; -} - -/// Created a PI device object from a HIP device handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI device object from. -/// \param[in] platform is the PI platform of the device. -/// \param[out] device Set to the PI device object created from native handle. -/// -/// \return TBD -pi_result hip_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_platform platform, - pi_device *device) { - (void)nativeHandle; - (void)platform; - (void)device; - sycl::detail::pi::die( - "Creation of PI device from native handle not implemented"); - return {}; -} - -/* Context APIs */ - -/// Create a PI HIP context. -/// -/// By default creates a scoped context and keeps the last active HIP context -/// on top of the HIP context stack. 
-/// With the __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY key/id and a value of -/// PI_TRUE creates a primary HIP context and activates it on the HIP context -/// stack. -/// -/// \param[in] properties 0 terminated array of key/id-value combinations. Can -/// be nullptr. Only accepts property key/id -/// __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY with a pi_bool value. -/// \param[in] num_devices Number of devices to create the context for. -/// \param[in] devices Devices to create the context for. -/// \param[in] pfn_notify Callback, currently unused. -/// \param[in] user_data User data for callback. -/// \param[out] retcontext Set to created context on success. -/// -/// \return PI_SUCCESS on success, otherwise an error return code. -pi_result hip_piContextCreate( - const pi_context_properties *properties, - [[maybe_unused]] pi_uint32 num_devices, const pi_device *devices, - [[maybe_unused]] void (*pfn_notify)(const char *errinfo, - const void *private_info, size_t cb, - [[maybe_unused]] void *user_data), - [[maybe_unused]] void *user_data, pi_context *retcontext) { - - assert(devices != nullptr); - // TODO: How to implement context callback? - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_devices == 1); - // Need input context - assert(retcontext != nullptr); - pi_result errcode_ret = PI_SUCCESS; - - // Parse properties. - bool property_hip_primary = false; - while (properties && (0 != *properties)) { - // Consume property ID. - pi_context_properties id = *properties; - ++properties; - // Consume property value. - pi_context_properties value = *properties; - ++properties; - switch (id) { - case __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY: - assert(value == PI_FALSE || value == PI_TRUE); - property_hip_primary = static_cast(value); - break; - default: - // Unknown property. 
- sycl::detail::pi::die( - "Unknown piContextCreate property in property list"); - return PI_ERROR_INVALID_VALUE; - } - } - - std::unique_ptr<_pi_context> piContextPtr{nullptr}; - try { - hipCtx_t current = nullptr; - - if (property_hip_primary) { - // Use the HIP primary context and assume that we want to use it - // immediately as we want to forge context switches. - hipCtx_t Ctxt; - errcode_ret = - PI_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Ctxt, devices[0]->get())); - piContextPtr = std::unique_ptr<_pi_context>( - new _pi_context{_pi_context::kind::primary, Ctxt, *devices}); - errcode_ret = PI_CHECK_ERROR(hipCtxPushCurrent(Ctxt)); - } else { - // Create a scoped context. - hipCtx_t newContext; - PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); - errcode_ret = PI_CHECK_ERROR( - hipCtxCreate(&newContext, hipDeviceMapHost, devices[0]->get())); - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{ - _pi_context::kind::user_defined, newContext, *devices}); - } - - static std::once_flag initFlag; - std::call_once( - initFlag, - [](pi_result &) { - // Use default stream to record base event counter - PI_CHECK_ERROR( - hipEventCreateWithFlags(&_pi_platform::evBase_, hipEventDefault)); - PI_CHECK_ERROR(hipEventRecord(_pi_platform::evBase_, 0)); - }, - errcode_ret); - - // For non-primary scoped contexts keep the last active on top of the stack - // as `cuCtxCreate` replaces it implicitly otherwise. - // Primary contexts are kept on top of the stack, so the previous context - // is not queried and therefore not recovered. - if (current != nullptr) { - PI_CHECK_ERROR(hipCtxSetCurrent(current)); - } - - *retcontext = piContextPtr.release(); - } catch (pi_result err) { - errcode_ret = err; - } catch (...) 
{ - errcode_ret = PI_ERROR_OUT_OF_RESOURCES; - } - return errcode_ret; -} - -pi_result hip_piContextRelease(pi_context ctxt) { - - assert(ctxt != nullptr); - - if (ctxt->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - ctxt->invoke_extended_deleters(); - - std::unique_ptr<_pi_context> context{ctxt}; - - if (!ctxt->is_primary()) { - hipCtx_t hipCtxt = ctxt->get(); - // hipCtxSynchronize is not supported for AMD platform so we can just - // destroy the context, for NVIDIA make sure it's synchronized. -#if defined(__HIP_PLATFORM_NVIDIA__) - hipCtx_t current = nullptr; - PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); - if (hipCtxt != current) { - PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt)); - } - PI_CHECK_ERROR(hipCtxSynchronize()); - PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); - if (hipCtxt == current) { - PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); - } -#endif - return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt)); - } else { - // Primary context is not destroyed, but released - hipDevice_t hipDev = ctxt->get_device()->get(); - hipCtx_t current; - PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); - return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev)); - } - - hipCtx_t hipCtxt = ctxt->get(); - return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt)); -} - -/// Gets the native HIP handle of a PI context object -/// -/// \param[in] context The PI context to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI context object. -/// -/// \return PI_SUCCESS -pi_result hip_piextContextGetNativeHandle(pi_context context, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(context->get()); - return PI_SUCCESS; -} - -/// Created a PI context object from a HIP context handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI context object from. -/// \param[out] context Set to the PI context object created from native handle. 
-/// -/// \return TBD -pi_result hip_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_uint32 num_devices, - const pi_device *devices, - bool ownNativeHandle, - pi_context *context) { - (void)nativeHandle; - (void)num_devices; - (void)devices; - (void)ownNativeHandle; - (void)context; - sycl::detail::pi::die( - "Creation of PI context from native handle not implemented"); - return {}; -} - /// Creates a PI Memory object using a HIP memory allocation. /// Can trigger a manual copy depending on the mode. /// \TODO Implement USE_HOST_PTR using cuHostRegister @@ -3046,13 +1615,13 @@ pi_result hip_piEnqueueKernelLaunch( bool providedLocalWorkGroupSize = (local_work_size != nullptr); { - pi_result retError = hip_piDeviceGetInfo( + pi_result retError = pi2ur::piDeviceGetInfo( command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); assert(retError == PI_SUCCESS); (void)retError; - retError = hip_piDeviceGetInfo( + retError = pi2ur::piDeviceGetInfo( command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); assert(retError == PI_SUCCESS); @@ -5497,10 +4066,11 @@ pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, // the same index std::vector platforms; platforms.resize(device_idx + 1); - result = hip_piPlatformsGet(device_idx + 1, platforms.data(), nullptr); + result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); // get the device from the platform - pi_device device = platforms[device_idx]->devices_[0].get(); + pi_device device = + reinterpret_cast(platforms[device_idx]->devices_[0].get()); return getInfo(param_value_size, param_value, param_value_size_ret, device); } @@ -5591,18 +4161,6 @@ pi_result hip_piextEnqueueWriteHostPipe( return {}; } -// This API is called by Sycl RT to notify the end of the plugin lifetime. 
-// Windows: dynamically loaded plugins might have been unloaded already -// when this is called. Sycl RT holds onto the PI plugin so it can be -// called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. -// TODO: add a global variable lifetime management code here (see -// pi_level_zero.cpp for reference) Currently this is just a NOOP. -pi_result hip_piTearDown(void *PluginParameter) { - (void)PluginParameter; - return PI_SUCCESS; -} - pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { if (!DeviceTime && !HostTime) @@ -5627,8 +4185,8 @@ pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, PI_CHECK_ERROR(hipEventSynchronize(event)); float elapsedTime = 0.0f; - PI_CHECK_ERROR( - hipEventElapsedTime(&elapsedTime, _pi_platform::evBase_, event)); + PI_CHECK_ERROR(hipEventElapsedTime(&elapsedTime, + ur_platform_handle_t_::evBase_, event)); *DeviceTime = (uint64_t)(elapsedTime * (double)1e6); } return PI_SUCCESS; @@ -5656,28 +4214,28 @@ pi_result piPluginInit(pi_plugin *PluginInit) { (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&hip_api); // Platform - _PI_CL(piPlatformsGet, hip_piPlatformsGet) - _PI_CL(piPlatformGetInfo, hip_piPlatformGetInfo) + _PI_CL(piPlatformsGet, pi2ur::piPlatformsGet) + _PI_CL(piPlatformGetInfo, pi2ur::piPlatformGetInfo) // Device - _PI_CL(piDevicesGet, hip_piDevicesGet) - _PI_CL(piDeviceGetInfo, hip_piDeviceGetInfo) - _PI_CL(piDevicePartition, hip_piDevicePartition) - _PI_CL(piDeviceRetain, hip_piDeviceRetain) - _PI_CL(piDeviceRelease, hip_piDeviceRelease) + _PI_CL(piDevicesGet, pi2ur::piDevicesGet) + _PI_CL(piDeviceGetInfo, pi2ur::piDeviceGetInfo) + _PI_CL(piDevicePartition, pi2ur::piDevicePartition) + _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) + _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) _PI_CL(piextDeviceSelectBinary, hip_piextDeviceSelectBinary) 
_PI_CL(piextGetDeviceFunctionPointer, hip_piextGetDeviceFunctionPointer) - _PI_CL(piextDeviceGetNativeHandle, hip_piextDeviceGetNativeHandle) + _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, - hip_piextDeviceCreateWithNativeHandle) + pi2ur::piextDeviceCreateWithNativeHandle) // Context - _PI_CL(piextContextSetExtendedDeleter, hip_piextContextSetExtendedDeleter) - _PI_CL(piContextCreate, hip_piContextCreate) - _PI_CL(piContextGetInfo, hip_piContextGetInfo) - _PI_CL(piContextRetain, hip_piContextRetain) - _PI_CL(piContextRelease, hip_piContextRelease) - _PI_CL(piextContextGetNativeHandle, hip_piextContextGetNativeHandle) + _PI_CL(piextContextSetExtendedDeleter, pi2ur::piextContextSetExtendedDeleter) + _PI_CL(piContextCreate, pi2ur::piContextCreate) + _PI_CL(piContextGetInfo, pi2ur::piContextGetInfo) + _PI_CL(piContextRetain, pi2ur::piContextRetain) + _PI_CL(piContextRelease, pi2ur::piContextRelease) + _PI_CL(piextContextGetNativeHandle, pi2ur::piextContextGetNativeHandle) _PI_CL(piextContextCreateWithNativeHandle, - hip_piextContextCreateWithNativeHandle) + pi2ur::piextContextCreateWithNativeHandle) // Queue _PI_CL(piQueueCreate, hip_piQueueCreate) _PI_CL(piextQueueCreate, hip_piextQueueCreate) @@ -5784,7 +4342,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, hip_piPluginGetLastError) - _PI_CL(piTearDown, hip_piTearDown) + _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, hip_piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, hip_piPluginGetBackendOption) @@ -5800,5 +4358,3 @@ pi_result piPluginInit(pi_plugin *PluginInit) { #endif } // extern "C" - -hipEvent_t _pi_platform::evBase_{nullptr}; diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 4b3c5f53298d9..9281228c3c85e 100644 --- 
a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -39,13 +39,15 @@ #include #include +#include +#include +#include + +#include "pi2ur.hpp" + extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN -pi_result hip_piContextRetain(pi_context); -pi_result hip_piContextRelease(pi_context); -pi_result hip_piDeviceRelease(pi_device); -pi_result hip_piDeviceRetain(pi_device); pi_result hip_piProgramRetain(pi_program); pi_result hip_piProgramRelease(pi_program); pi_result hip_piQueueRelease(pi_queue); @@ -64,9 +66,8 @@ using _pi_stream_guard = std::unique_lock; /// available devices since initialization is done /// when devices are used. /// -struct _pi_platform { - static hipEvent_t evBase_; // HIP event used as base counter - std::vector> devices_; +struct _pi_platform : ur_platform_handle_t_ { + using ur_platform_handle_t_::ur_platform_handle_t_; }; /// PI device mapping to a hipDevice_t. @@ -74,28 +75,8 @@ struct _pi_platform { /// and implements the reference counting semantics since /// HIP objects are not refcounted. /// -struct _pi_device { -private: - using native_type = hipDevice_t; - - native_type cuDevice_; - std::atomic_uint32_t refCount_; - pi_platform platform_; - pi_context context_; - -public: - _pi_device(native_type cuDevice, pi_platform platform) - : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {} - - native_type get() const noexcept { return cuDevice_; }; - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_platform get_platform() const noexcept { return platform_; }; - - void set_context(pi_context ctx) { context_ = ctx; }; - - pi_context get_context() { return context_; }; +struct _pi_device : ur_device_handle_t_ { + using ur_device_handle_t_::ur_device_handle_t_; }; /// PI context mapping to a HIP context object. @@ -136,58 +117,8 @@ struct _pi_device { /// called upon destruction of the PI Context. /// See proposal for details. 
/// -struct _pi_context { - - struct deleter_data { - pi_context_extended_deleter function; - void *user_data; - - void operator()() { function(user_data); } - }; - - using native_type = hipCtx_t; - - enum class kind { primary, user_defined } kind_; - native_type hipContext_; - _pi_device *deviceId_; - std::atomic_uint32_t refCount_; - - _pi_context(kind k, hipCtx_t ctxt, _pi_device *devId) - : kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1} { - deviceId_->set_context(this); - hip_piDeviceRetain(deviceId_); - }; - - ~_pi_context() { hip_piDeviceRelease(deviceId_); } - - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); - } - } - - void set_extended_deleter(pi_context_extended_deleter function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); - } - - pi_device get_device() const noexcept { return deviceId_; } - - native_type get() const noexcept { return hipContext_; } - - bool is_primary() const noexcept { return kind_ == kind::primary; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - -private: - std::mutex mutex_; - std::vector extended_deleters_; +struct _pi_context : ur_context_handle_t_ { + using ur_context_handle_t_::ur_context_handle_t_; }; /// PI Mem mapping to HIP memory allocations, both data and texture/surface. 
@@ -329,7 +260,7 @@ struct _pi_mem { if (is_sub_buffer()) { hip_piMemRetain(mem_.buffer_mem_.parent_); } else { - hip_piContextRetain(context_); + pi2ur::piContextRetain(context_); } }; @@ -341,7 +272,7 @@ struct _pi_mem { mem_.surface_mem_.array_ = array; mem_.surface_mem_.imageType_ = image_type; mem_.surface_mem_.surfObj_ = surf; - hip_piContextRetain(context_); + pi2ur::piContextRetain(context_); } ~_pi_mem() { @@ -351,7 +282,7 @@ struct _pi_mem { return; } } - hip_piContextRelease(context_); + pi2ur::piContextRelease(context_); } // TODO: Move as many shared funcs up as possible @@ -425,13 +356,13 @@ struct _pi_queue { num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, flags_(flags) { - hip_piContextRetain(context_); - hip_piDeviceRetain(device_); + pi2ur::piContextRetain(context_); + pi2ur::piDeviceRetain(device_); } ~_pi_queue() { - hip_piContextRelease(context_); - hip_piDeviceRelease(device_); + pi2ur::piContextRelease(context_); + pi2ur::piDeviceRelease(device_); } void compute_stream_wait_for_barrier_if_needed(hipStream_t stream, @@ -873,7 +804,7 @@ struct _pi_kernel { : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, name_{name}, context_{ctxt}, program_{program}, refCount_{1} { hip_piProgramRetain(program_); - hip_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_kernel(hipFunction_t func, const char *name, pi_program program, @@ -882,7 +813,7 @@ struct _pi_kernel { ~_pi_kernel() { hip_piProgramRelease(program_); - hip_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_program get_program() const noexcept { return program_; } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 318059da5eaf9..5e47ee42c2cda 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -169,6 +169,43 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) ) endif() 
+if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) + # Build HIP adapter + add_sycl_library("ur_adapter_hip" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/hip/common.cpp" + "ur/adapters/hip/common.hpp" + "ur/adapters/hip/context.cpp" + "ur/adapters/hip/context.hpp" + "ur/adapters/hip/device.cpp" + "ur/adapters/hip/device.hpp" + "ur/adapters/hip/platform.cpp" + "ur/adapters/hip/platform.hpp" + "ur/adapters/hip/ur_interface_loader.cpp" + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads + ) + + if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD") + target_link_libraries(ur_adapter_hip PUBLIC rocmdrv) + # Set HIP define to select AMD platform + target_compile_definitions(ur_adapter_hip PRIVATE __HIP_PLATFORM_AMD__) + elseif("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "NVIDIA") + target_link_libraries(ur_adapter_hip PUBLIC cudadrv cudart) + # Set HIP define to select NVIDIA platform + target_compile_definitions(ur_adapter_hip PRIVATE __HIP_PLATFORM_NVIDIA__) + else() + message(FATAL_ERROR "Unspecified PI HIP platform please set SYCL_BUILD_PI_HIP_PLATFORM to 'AMD' or 'NVIDIA'") + endif() +endif() + if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) # Install the UR loader. 
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index fbff734ecaf60..432baa3224f31 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -200,7 +200,7 @@ inline pi_result ur2piPlatformInfoValue(ur_platform_info_t ParamName, case UR_PLATFORM_BACKEND_CUDA: return PI_EXT_PLATFORM_BACKEND_CUDA; case UR_PLATFORM_BACKEND_HIP: - return PI_EXT_PLATFORM_BACKEND_CUDA; + return PI_EXT_PLATFORM_BACKEND_HIP; default: die("UR_PLATFORM_INFO_BACKEND: unhandled value"); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp new file mode 100644 index 0000000000000..28777e9e9d085 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -0,0 +1,84 @@ +//===--------- common.cpp - HIP Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#include "common.hpp" + +#include + +ur_result_t map_error_ur(hipError_t result) { + switch (result) { + case hipSuccess: + return UR_RESULT_SUCCESS; + case hipErrorInvalidContext: + return UR_RESULT_ERROR_INVALID_CONTEXT; + case hipErrorInvalidDevice: + return UR_RESULT_ERROR_INVALID_DEVICE; + case hipErrorInvalidValue: + return UR_RESULT_ERROR_INVALID_VALUE; + case hipErrorOutOfMemory: + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case hipErrorLaunchOutOfResources: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} + +ur_result_t check_error_ur(hipError_t result, const char *function, int line, + const char *file) { + if (result == hipSuccess) { + return UR_RESULT_SUCCESS; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *errorString = nullptr; + const char *errorName = nullptr; + errorName = hipGetErrorName(result); + errorString = hipGetErrorString(result); + std::stringstream ss; + ss << "\nUR HIP ERROR:" + << "\n\tValue: " << result + << "\n\tName: " << errorName + << "\n\tDescription: " << errorString + << "\n\tFunction: " << function << "\n\tSource Location: " << file + << ":" << line << "\n" + << std::endl; + std::cerr << ss.str(); + } + + if (std::getenv("PI_HIP_ABORT") != nullptr) { + std::abort(); + } + + throw map_error_ur(result); +} + +std::string getHipVersionString() { + int driver_version = 0; + if (hipDriverGetVersion(&driver_version) != hipSuccess) { + return ""; + } + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "HIP " << driver_version / 1000 << "." 
+ << driver_version % 1000 / 10; + return stream.str(); +} + +void sycl::detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); +} + +void sycl::detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +void sycl::detail::ur::hipPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp new file mode 100644 index 0000000000000..2d32ebf0abd34 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp @@ -0,0 +1,99 @@ +//===--------- common.hpp - HIP Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +// Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be +// indexed, but on NVidia it is an opaque type and needs to go through +// cuArrayGetDescriptor so implement a utility function to get the array +// properties +inline void getArrayDesc(hipArray *array, hipArray_Format &format, + size_t &channels) { +#if defined(__HIP_PLATFORM_AMD__) + format = array->Format; + channels = array->NumChannels; +#elif defined(__HIP_PLATFORM_NVIDIA__) + CUDA_ARRAY_DESCRIPTOR arrayDesc; + cuArrayGetDescriptor(&arrayDesc, (CUarray)array); + + format = arrayDesc.Format; + channels = arrayDesc.NumChannels; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif +} + +// NVidia HIP headers guard hipArray3DCreate behind __CUDACC__, this does not +// seem to be required and we're not using nvcc to build the UR HIP adapter so +// add the 
translation function here
+#if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__)
+inline static hipError_t
+hipArray3DCreate(hiparray *pHandle,
+                 const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) {
+  return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+}
+#endif
+
+// hipArray gets turned into cudaArray when using the HIP NVIDIA platform, and
+// some CUDA APIs use cudaArray* and others use CUarray, these two represent the
+// same type, however when building cudaArray appears as an opaque type, so it
+// needs to be explicitly casted to CUarray. In order for this to work for both
+// AMD and NVidia we introduce a second hipArray type that will be CUarray for
+// NVIDIA and hipArray* for AMD so that we can place the explicit casts when
+// necessary for NVIDIA and they will be no-ops for AMD.
+#if defined(__HIP_PLATFORM_NVIDIA__)
+typedef CUarray hipCUarray;
+#elif defined(__HIP_PLATFORM_AMD__)
+typedef hipArray *hipCUarray;
+#else
+#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+// Add missing HIP to CUDA defines
+#if defined(__HIP_PLATFORM_NVIDIA__)
+#define hipMemoryType CUmemorytype
+#define hipMemoryTypeHost CU_MEMORYTYPE_HOST
+#define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE
+#define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY
+#define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED
+#endif
+
+ur_result_t map_error_ur(hipError_t result);
+
+ur_result_t check_error_ur(hipError_t result, const char *function, int line,
+                           const char *file);
+
+#define UR_CHECK_ERROR(result)                                                 \
+  check_error_ur(result, __func__, __LINE__, __FILE__)
+
+std::string getHipVersionString();
+
+/// ------ Error handling, matching OpenCL plugin semantics.
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+namespace ur {
+
+// Report error and no return (keeps compiler from printing warnings).
+// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. +// +[[noreturn]] void die(const char *Message); + +// Reports error messages +void hipPrint(const char *Message); + +void assertion(bool Condition, const char *Message = nullptr); + +} // namespace ur +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp new file mode 100644 index 0000000000000..e3949881c4879 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -0,0 +1,185 @@ +//===--------- context.cpp - HIP Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "context.hpp" + +/// Create a UR HIP context. +/// +/// By default creates a scoped context and keeps the last active HIP context +/// on top of the HIP context stack. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { + UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + assert(DeviceCount == 1); + ur_result_t errcode_ret = UR_RESULT_SUCCESS; + + std::unique_ptr urContextPtr{nullptr}; + try { + hipCtx_t current = nullptr; + + // Create a scoped context. 
+    hipCtx_t newContext;
+    UR_CHECK_ERROR(hipCtxGetCurrent(&current));
+    errcode_ret = UR_CHECK_ERROR(
+        hipCtxCreate(&newContext, hipDeviceMapHost, phDevices[0]->get()));
+    urContextPtr =
+        std::unique_ptr<ur_context_handle_t_>(new ur_context_handle_t_{
+            ur_context_handle_t_::kind::user_defined, newContext, *phDevices});
+
+    static std::once_flag initFlag;
+    std::call_once(
+        initFlag,
+        [](ur_result_t &err) {
+          // Use default stream to record base event counter
+          UR_CHECK_ERROR(hipEventCreateWithFlags(
+              &ur_platform_handle_t_::evBase_, hipEventDefault));
+          UR_CHECK_ERROR(hipEventRecord(ur_platform_handle_t_::evBase_, 0));
+        },
+        errcode_ret);
+
+    // For non-primary scoped contexts keep the last active on top of the stack
+    // as `hipCtxCreate` replaces it implicitly otherwise.
+    // Primary contexts are kept on top of the stack, so the previous context
+    // is not queried and therefore not recovered.
+    if (current != nullptr) {
+      UR_CHECK_ERROR(hipCtxSetCurrent(current));
+    }
+
+    *phContext = urContextPtr.release();
+  } catch (ur_result_t err) {
+    errcode_ret = err;
+  } catch (...)
{ + errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return errcode_ret; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t hContext, ur_context_info_t ContextInfoType, + size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + + switch (uint32_t{ContextInfoType}) { + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(1); + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(hContext->get_device()); + case UR_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(hContext->get_reference_count()); + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: + case UR_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: + case UR_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // These queries should be dealt with in context_impl.cpp by calling the + // queries of each device separately and building the intersection set. + setErrorMessage("These queries should have never come here.", + UR_RESULT_ERROR_INVALID_ARGUMENT); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(true); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM operations currently not supported. + return ReturnValue(false); + + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { + UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (ctxt->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + ctxt->invoke_extended_deleters(); + + std::unique_ptr context{ctxt}; + + if (!ctxt->is_primary()) { + hipCtx_t hipCtxt = ctxt->get(); + // hipCtxSynchronize is not supported for AMD platform so we can just + // destroy the context, for NVIDIA make sure it's synchronized. 
+#if defined(__HIP_PLATFORM_NVIDIA__)
+    hipCtx_t current = nullptr;
+    UR_CHECK_ERROR(hipCtxGetCurrent(&current));
+    if (hipCtxt != current) {
+      UR_CHECK_ERROR(hipCtxPushCurrent(hipCtxt));
+    }
+    UR_CHECK_ERROR(hipCtxSynchronize());
+    UR_CHECK_ERROR(hipCtxGetCurrent(&current));
+    if (hipCtxt == current) {
+      UR_CHECK_ERROR(hipCtxPopCurrent(&current));
+    }
+#endif
+    return UR_CHECK_ERROR(hipCtxDestroy(hipCtxt));
+  } else {
+    // Primary context is not destroyed, but released
+    hipDevice_t hipDev = ctxt->get_device()->get();
+    hipCtx_t current;
+    UR_CHECK_ERROR(hipCtxPopCurrent(&current));
+    return UR_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev));
+  }
+
+  hipCtx_t hipCtxt = ctxt->get();
+  return UR_CHECK_ERROR(hipCtxDestroy(hipCtxt));
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) {
+  UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  assert(ctxt->get_reference_count() > 0);
+
+  ctxt->increment_reference_count();
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
+    ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
+  UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(phNativeContext, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  *phNativeContext = reinterpret_cast<ur_native_handle_t>(hContext->get());
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle(
+    ur_native_handle_t hNativeContext, uint32_t numDevices,
+    const ur_device_handle_t *phDevices,
+    const ur_context_native_properties_t *pProperties,
+    ur_context_handle_t *phContext) {
+  (void)hNativeContext;
+  (void)phContext;
+
+  // TODO(ur): Needed for the conformance test to pass, but it may be valid
+  // to have a null HIP context
+  UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  return UR_RESULT_ERROR_INVALID_OPERATION;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter(
+    ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter,
+   
void *pUserData) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hContext->set_extended_deleter(pfnDeleter, pUserData); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp new file mode 100644 index 0000000000000..da634c3dbe474 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp @@ -0,0 +1,122 @@ +//===--------- context.hpp - HIP Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" +#include "device.hpp" +#include "platform.hpp" + +// We need this declaration temporarily while UR and PI share ScopedContext +class _pi_context; +using pi_context = _pi_context *; + +typedef void (*ur_context_extended_deleter_t)(void *user_data); + +struct ur_context_handle_t_ : public _ur_object { + + struct deleter_data { + ur_context_extended_deleter_t function; + void *user_data; + + void operator()() { function(user_data); } + }; + + using native_type = hipCtx_t; + + enum class kind { primary, user_defined } kind_; + native_type hipContext_; + ur_device_handle_t deviceId_; + std::atomic_uint32_t refCount_; + + ur_context_handle_t_(kind k, hipCtx_t ctxt, ur_device_handle_t devId) + : kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1} { + deviceId_->set_context(this); + urDeviceRetain(deviceId_); + }; + + ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + + void invoke_extended_deleters() { + std::lock_guard guard(mutex_); + for (auto &deleter : extended_deleters_) { + deleter(); + } + } + + void set_extended_deleter(ur_context_extended_deleter_t 
function,
+                            void *user_data) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    extended_deleters_.emplace_back(deleter_data{function, user_data});
+  }
+
+  ur_device_handle_t get_device() const noexcept { return deviceId_; }
+
+  native_type get() const noexcept { return hipContext_; }
+
+  bool is_primary() const noexcept { return kind_ == kind::primary; }
+
+  uint32_t increment_reference_count() noexcept { return ++refCount_; }
+
+  uint32_t decrement_reference_count() noexcept { return --refCount_; }
+
+  uint32_t get_reference_count() const noexcept { return refCount_; }
+
+private:
+  std::mutex mutex_;
+  std::vector<deleter_data> extended_deleters_;
+};
+
+namespace {
+/// RAII type to guarantee recovering original HIP context
+/// Scoped context is used across all UR HIP plugin implementation
+/// to activate the UR Context on the current thread, matching the
+/// HIP driver semantics where the context used for the HIP Driver
+/// API is the one active on the thread.
+/// The implementation tries to avoid replacing the hipCtx_t if it can
+class ScopedContext {
+  ur_context_handle_t placedContext_;
+  hipCtx_t original_;
+  bool needToRecover_;
+
+public:
+  // TODO(ur): Needed for compatibility with PI; once the HIP PI plugin is
+  // fully moved over we can drop this constructor
+  ScopedContext(pi_context ctxt);
+
+  ScopedContext(ur_context_handle_t ctxt)
+      : placedContext_{ctxt}, needToRecover_{false} {
+
+    if (!placedContext_) {
+      throw UR_RESULT_ERROR_INVALID_CONTEXT;
+    }
+
+    hipCtx_t desired = placedContext_->get();
+    UR_CHECK_ERROR(hipCtxGetCurrent(&original_));
+    if (original_ != desired) {
+      // Sets the desired context as the active one for the thread
+      UR_CHECK_ERROR(hipCtxSetCurrent(desired));
+      if (original_ == nullptr) {
+        // No context is installed on the current thread
+        // This is the most common case. We can activate the context in the
+        // thread and leave it there until all the UR context referring to the
+        // same underlying HIP context are destroyed.
This emulates + // the behaviour of the HIP runtime api, and avoids costly context + // switches. No action is required on this side of the if. + } else { + needToRecover_ = true; + } + } + } + + ~ScopedContext() { + if (needToRecover_) { + UR_CHECK_ERROR(hipCtxSetCurrent(original_)); + } + } +}; +} // namespace \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp new file mode 100644 index 0000000000000..0f57a2c8d0096 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -0,0 +1,918 @@ +//===--------- device.cpp - HIP Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "device.hpp" +#include "context.hpp" + +#include + +int getAttribute(ur_device_handle_t device, hipDeviceAttribute_t attribute) { + int value; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess); + return value; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, + ur_device_info_t infoType, + size_t propSize, + void *pDeviceInfo, + size_t *pPropSizeRet) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + + static constexpr uint32_t max_work_item_dimensions = 3u; + + switch ((uint32_t)infoType) { + case UR_DEVICE_INFO_TYPE: { + return ReturnValue(UR_DEVICE_TYPE_GPU); + } + case UR_DEVICE_INFO_VENDOR_ID: { +#if defined(__HIP_PLATFORM_AMD__) + uint32_t vendor_id = 4098u; +#elif defined(__HIP_PLATFORM_NVIDIA__) + uint32_t vendor_id = 4318u; +#else + uint32_t vendor_id = 0u; +#endif + return ReturnValue(vendor_id); + } + case 
UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int compute_units = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&compute_units, + hipDeviceAttributeMultiprocessorCount, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(compute_units >= 0); + return ReturnValue(static_cast(compute_units)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return ReturnValue(max_work_item_dimensions); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = 
size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int max_work_group_size = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_work_group_size, + hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); + + sycl::detail::ur::assertion(max_work_group_size >= 0); + + return ReturnValue(size_t(max_work_group_size)); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_threads, + hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); + int warpSize = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + 
device->get()) == hipSuccess); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(maxWarps); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int major = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, + device->get()) == hipSuccess); + bool ifp = (major >= 7); + return ReturnValue(ifp); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + int warpSize = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + size_t sizes[1] = {static_cast(warpSize)}; + return ReturnValue(sizes, 1); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int clock_freq = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(clock_freq >= 0); + return ReturnValue(static_cast(clock_freq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { + auto bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(bits); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_CUSTOM. 
+ + size_t global = 0; + sycl::detail::ur::assertion(hipDeviceTotalMem(&global, device->get()) == + hipSuccess); + + auto quarter_global = static_cast(global / 4u); + + auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), + 32u * 1024u * 1024u); + + return ReturnValue(uint64_t{max_alloc}); + } + case UR_DEVICE_INFO_IMAGE_SUPPORTED: { + return ReturnValue(uint32_t{true}); + } + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to HIP as it doesn't have images, but instead + // surfaces and textures. No clear call in the HIP API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to HIP as it doesn't have images, but instead + // surfaces and textures. No clear call in the HIP API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_height, + hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. 
+ int tex_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_height, + hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
+ int tex_depth = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_depth >= 0); + int surf_depth = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_depth >= 0); + + int min = std::min(tex_depth, surf_depth); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return ReturnValue(0lu); + } + case UR_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for HIP, as samplers don't exist. + // Closest thing is textures, which is 128. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. 
+ return ReturnValue(4000lu); + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int mem_base_addr_align = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&mem_base_addr_align, + hipDeviceAttributeTextureAlignment, + device->get()) == hipSuccess); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + mem_base_addr_align *= 8; + return ReturnValue(mem_base_addr_align); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + uint64_t config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA | + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return ReturnValue(config); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA; + return ReturnValue(config); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is dohipmented for all existing GPUs in the HIP programming + // guidelines, section "H.3.2. Global Memory". + return ReturnValue(128u); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int cache_size = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(cache_size >= 0); + // The L2 cache is global to the GPU. 
+ return ReturnValue(static_cast(cache_size)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. + sycl::detail::ur::assertion(hipDeviceTotalMem(&bytes, device->get()) == + hipSuccess); + return ReturnValue(uint64_t{bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int constant_memory = 0; + + // hipDeviceGetAttribute takes a int*, however the size of the constant + // memory on AMD GPU may be larger than what can fit in the positive part + // of a signed integer, so use an unsigned integer and cast the pointer to + // int*. + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&constant_memory, + hipDeviceAttributeTotalConstantMemory, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(constant_memory >= 0); + + return ReturnValue(static_cast(constant_memory)); + } + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from HIP driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX + // 1060 3GB + return ReturnValue(9u); + } + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to HIP's "shared memory". + // HIP has its own definition of "local memory", which maps to OpenCL's + // "private memory". 
+ int local_mem_size = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&local_mem_size, + hipDeviceAttributeMaxSharedMemoryPerBlock, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(local_mem_size >= 0); + return ReturnValue(static_cast(local_mem_size)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ecc_enabled = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled, + device->get()) == hipSuccess); + + sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); + auto result = static_cast(ecc_enabled); + return ReturnValue(result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int is_integrated = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated, + device->get()) == hipSuccess); + + sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); + auto result = static_cast(is_integrated); + return ReturnValue(result); + } + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX + // 1060 3GB + return ReturnValue(1000lu); + } + case UR_DEVICE_INFO_ENDIAN_LITTLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_COMPILER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_LINKER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto capability = ur_device_exec_capability_flags_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return ReturnValue(capability); + } + case 
UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: + case UR_DEVICE_INFO_QUEUE_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. + return ReturnValue(""); + } + case UR_DEVICE_INFO_PLATFORM: { + return ReturnValue(device->get_platform()); + } + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; + char name[MAX_DEVICE_NAME_LENGTH]; + sycl::detail::ur::assertion(hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, + device->get()) == hipSuccess); + // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch + // name instead, this is also what AMD OpenCL devices return. + if (strlen(name) == 0) { + hipDeviceProp_t props; + sycl::detail::ur::assertion( + hipGetDeviceProperties(&props, device->get()) == hipSuccess); + + return ReturnValue(props.gcnArchName, strlen(props.gcnArchName) + 1); + } + return ReturnValue(name, strlen(name) + 1); + } + case UR_DEVICE_INFO_VENDOR: { + return ReturnValue("AMD Corporation"); + } + case UR_DEVICE_INFO_DRIVER_VERSION: { + auto version = getHipVersionString(); + return ReturnValue(version.c_str()); + } + case UR_DEVICE_INFO_PROFILE: { + return ReturnValue("HIP"); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: { + return ReturnValue(device->get_reference_count()); + } + case UR_DEVICE_INFO_VERSION: { + std::stringstream s; + + hipDeviceProp_t props; + sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == + hipSuccess); +#if defined(__HIP_PLATFORM_NVIDIA__) + s << props.major << "." 
<< props.minor; +#elif defined(__HIP_PLATFORM_AMD__) + s << props.gcnArchName; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + return ReturnValue(s.str().c_str()); + } + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { + return ReturnValue(""); + } + case UR_DEVICE_INFO_EXTENSIONS: { + // TODO: Remove comment when HIP support native asserts. + // DEVICELIB_ASSERT extension is set so fallback assert + // postprocessing is NOP. HIP 4.3 docs indicate support for + // native asserts are in progress + std::string SupportedExtensions = ""; + SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += " "; + + hipDeviceProp_t props; + sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == + hipSuccess); + + if (props.arch.hasDoubles) { + SupportedExtensions += "cl_khr_fp64 "; + } + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return ReturnValue(1024lu); + } + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_PARENT_DEVICE: { + return ReturnValue(nullptr); + } + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + return ReturnValue(static_cast(0u)); + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_TYPE: { + return ReturnValue(static_cast(0u)); + } + + // Intel USM extensions + case UR_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + uint64_t value = {}; + // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + uint64_t value = + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." 
+ // + // query if/how the device can access managed memory associated to it + uint64_t value = {}; + if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + // the device can allocate managed memory on this system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { + // the device can coherently access managed memory concurrently with the + // CPU + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+ // + // query if/how the device can access managed memory associated to other + // devices + uint64_t value = {}; + if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + // the device can allocate managed memory on this system + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { + // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + // attribute can coherently access managed memory concurrently with the + // CPU + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The shared system memory access capabilities apply to any allocations + // made by a system allocator, such as malloc or new." 
+ // + // query if/how the device can access pageable host memory allocated by the + // system allocator + uint64_t value = {}; + if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) { + // the link between the device and the host does not support native + // atomic operations + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + return ReturnValue(value); + } + + case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { + int major = 0, minor = 0; + sycl::detail::ur::assertion( + hipDeviceComputeCapability(&major, &minor, device->get()) == + hipSuccess); + std::string result = std::to_string(major) + "." + std::to_string(minor); + return ReturnValue(result.c_str()); + } + + case UR_DEVICE_INFO_ATOMIC_64: { + // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS. + hipDeviceProp_t props; + sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == + hipSuccess); + return ReturnValue(props.arch.hasGlobalInt64Atomics && + props.arch.hasSharedInt64Atomics); + } + + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { + size_t FreeMemory = 0; + size_t TotalMemory = 0; + sycl::detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) == + hipSuccess, + "failed hipMemGetInfo() API."); + return ReturnValue(FreeMemory); + } + + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + int value = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(value >= 0); + // Convert kilohertz to megahertz when returning. 
+ return ReturnValue(value / 1000); + } + + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int value = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + return ReturnValue(int32_t{1}); + } + + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence/memory_scope_capabilities. + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + uint64_t capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence_order_capabilities. 
+ ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_DEVICE_ID: { + int value = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId, + device->get()) == hipSuccess); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_UUID: { +#if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \ + HIP_VERSION_MAJOR > 5) + hipUUID uuid = {}; + // Supported since 5.2+ + sycl::detail::ur::assertion(hipDeviceGetUuid(&uuid, device->get()) == + hipSuccess); + std::array name; + std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); + return ReturnValue(name.data(), 16); +#endif + return UR_RESULT_ERROR_INVALID_VALUE; + } + + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(false); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(false); + + // TODO: Investigate if this information is available on HIP. + case UR_DEVICE_INFO_PCI_ADDRESS: + case UR_DEVICE_INFO_GPU_EU_COUNT: + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + case UR_DEVICE_INFO_GPU_EU_SLICES: + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: + case UR_DEVICE_INFO_BFLOAT16: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// \return UR_RESULT_SUCCESS if the function is executed successfully +/// HIP devices are always root devices so retain always returns success. 
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \return UR_RESULT_SUCCESS always since HIP devices are always root +/// devices. +UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t DeviceType, + uint32_t NumEntries, + ur_device_handle_t *phDevices, + uint32_t *pNumDevices) { + ur_result_t err = UR_RESULT_SUCCESS; + const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool returnDevices = askingForDefault || askingForGPU || askingForAll; + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + + try { + UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); + + if (pNumDevices) { + *pNumDevices = numDevices; + } + + if (returnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { + phDevices[i] = hPlatform->devices_[i].get(); + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +/// Gets the native HIP handle of a UR device object +/// +/// \param[in] device The UR device to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the UR device object. 
+/// +/// \return UR_RESULT_SUCCESS + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( + ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeHandle = reinterpret_cast(hDevice->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( + ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_device_handle_t *phDevice) { + UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + return UR_RESULT_ERROR_INVALID_OPERATION; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp new file mode 100644 index 0000000000000..2c434c2697319 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp @@ -0,0 +1,43 @@ +//===--------- device.hpp - HIP Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" + +#include + +/// UR device mapping to a hipDevice_t. +/// Includes an observer pointer to the platform, +/// and implements the reference counting semantics since +/// HIP objects are not refcounted. 
+/// +struct ur_device_handle_t_ : public _ur_object { +private: + using native_type = hipDevice_t; + + native_type hipDevice_; + std::atomic_uint32_t refCount_; + ur_platform_handle_t platform_; + ur_context_handle_t context_; + +public: + ur_device_handle_t_(native_type hipDevice, ur_platform_handle_t platform) + : hipDevice_(hipDevice), refCount_{1}, platform_(platform) {} + + native_type get() const noexcept { return hipDevice_; }; + + uint32_t get_reference_count() const noexcept { return refCount_; } + + ur_platform_handle_t get_platform() const noexcept { return platform_; }; + + void set_context(ur_context_handle_t ctx) { context_ = ctx; }; + + ur_context_handle_t get_context() { return context_; }; +}; + +int getAttribute(ur_device_handle_t device, hipDeviceAttribute_t attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp new file mode 100644 index 0000000000000..1cc2c098e4a62 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -0,0 +1,141 @@ +//===--------- platform.cpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "platform.hpp" + +hipEvent_t ur_platform_handle_t_::evBase_{nullptr}; + +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, + size_t Size, void *pPlatformInfo, size_t *pSizeRet) { + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); + + switch (PlatformInfoType) { + case UR_PLATFORM_INFO_NAME: + return ReturnValue("AMD HIP BACKEND"); + case UR_PLATFORM_INFO_VENDOR_NAME: + return ReturnValue("AMD Corporation"); + case UR_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL PROFILE"); + case UR_PLATFORM_INFO_VERSION: { + auto version = getHipVersionString(); + return ReturnValue(version.c_str()); + } + case UR_PLATFORM_INFO_BACKEND: { + return ReturnValue(UR_PLATFORM_BACKEND_HIP); + } + case UR_PLATFORM_INFO_EXTENSIONS: { + return ReturnValue(""); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + +/// Obtains the HIP platform. +/// There is only one HIP platform, and contains all devices on the system. +/// Triggers the HIP Driver initialization (hipInit) the first time, so this +/// must be the first UR API called. +/// +/// However because multiple devices in a context is not currently supported, +/// place each device in a separate platform. 
+/// +UR_DLLEXPORT ur_result_t UR_APICALL +urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { + + try { + static std::once_flag initFlag; + static uint32_t numPlatforms = 1; + static std::vector platformIds; + + UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_VALUE); + + ur_result_t err = UR_RESULT_SUCCESS; + + std::call_once( + initFlag, + [](ur_result_t &err) { + if (hipInit(0) != hipSuccess) { + numPlatforms = 0; + return; + } + int numDevices = 0; + err = UR_CHECK_ERROR(hipGetDeviceCount(&numDevices)); + if (numDevices == 0) { + numPlatforms = 0; + return; + } + try { + // make one platform per device + numPlatforms = numDevices; + platformIds.resize(numDevices); + + for (int i = 0; i < numDevices; ++i) { + hipDevice_t device; + err = UR_CHECK_ERROR(hipDeviceGet(&device, i)); + platformIds[i].devices_.emplace_back( + new ur_device_handle_t_{device, &platformIds[i]}); + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + throw; + } + }, + err); + + if (pNumPlatforms != nullptr) { + *pNumPlatforms = numPlatforms; + } + + if (phPlatforms != nullptr) { + for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { + phPlatforms[i] = &platformIds[i]; + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { + UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *pVersion = UR_API_VERSION_CURRENT; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp new file mode 100644 index 0000000000000..8337e52095bb8 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp @@ -0,0 +1,18 @@ +//===--------- platform.hpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" +#include "device.hpp" + +#include + +struct ur_platform_handle_t_ : public _ur_platform { + static hipEvent_t evBase_; // HIP event used as base counter + std::vector> devices_; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp new file mode 100644 index 0000000000000..4add49dbf4fe1 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -0,0 +1,258 @@ +//===--------- ur_interface_loader.cpp - Unified Runtime ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce loader and adapter must have same version. + // Post 1.0 only major version match should be required. + if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +#if defined(__cplusplus) +extern "C" { +#endif + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetInfo = urPlatformGetInfo; + pDdiTable->pfnGetNativeHandle = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t 
version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetProfilingInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnWait = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = nullptr; + pDdiTable->pfnCompile = nullptr; + pDdiTable->pfnCreateWithBinary = nullptr; + pDdiTable->pfnCreateWithIL = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnLink = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetGroupInfo = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetSubGroupInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetArgLocal = nullptr; + pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgPointer = nullptr; + 
pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgValue = nullptr; + pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferPartition = nullptr; + pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; + pDdiTable->pfnImageCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnImageCreate = nullptr; + pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnEventsWait = nullptr; + pDdiTable->pfnEventsWaitWithBarrier = nullptr; + pDdiTable->pfnKernelLaunch = nullptr; + pDdiTable->pfnMemBufferCopy = nullptr; + pDdiTable->pfnMemBufferCopyRect = nullptr; + pDdiTable->pfnMemBufferFill = nullptr; + 
pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWriteRect = nullptr; + pDdiTable->pfnMemImageCopy = nullptr; + pDdiTable->pfnMemImageRead = nullptr; + pDdiTable->pfnMemImageWrite = nullptr; + pDdiTable->pfnMemUnmap = nullptr; + pDdiTable->pfnUSMFill2D = nullptr; + pDdiTable->pfnUSMFill = nullptr; + pDdiTable->pfnUSMAdvise = nullptr; + pDdiTable->pfnUSMMemcpy2D = nullptr; + pDdiTable->pfnUSMMemcpy = nullptr; + pDdiTable->pfnUSMPrefetch = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnInit = urInit; + pDdiTable->pfnTearDown = urTearDown; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnFinish = nullptr; + pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceAlloc = nullptr; + pDdiTable->pfnFree = nullptr; + pDdiTable->pfnGetMemAllocInfo = nullptr; + pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnPoolCreate = nullptr; 
+  pDdiTable->pfnPoolDestroy = nullptr;
+  // (duplicate pfnPoolDestroy assignment removed; line kept so hunk size is unchanged)
+  pDdiTable->pfnSharedAlloc = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable(
+    ur_api_version_t version, ur_device_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle;
+  pDdiTable->pfnGet = urDeviceGet;
+  pDdiTable->pfnGetGlobalTimestamps = nullptr;
+  pDdiTable->pfnGetInfo = urDeviceGetInfo;
+  pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle;
+  pDdiTable->pfnPartition = urDevicePartition;
+  pDdiTable->pfnRelease = urDeviceRelease;
+  pDdiTable->pfnRetain = urDeviceRetain;
+  pDdiTable->pfnSelectBinary = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp
index 2099b31529176..6978c6f2838a6 100644
--- a/sycl/plugins/unified_runtime/ur/ur.hpp
+++ b/sycl/plugins/unified_runtime/ur/ur.hpp
@@ -269,6 +269,10 @@ getInfo(size_t param_value_size, void *param_value,
 }
 } // namespace ur
 
+// FIXME: This class will cause failures in the UR CTS tests as it is used in UR
+// getInfo entry-points, this should be okay for now to make sycl-rt works
+// correctly with the existing PI layer. But, it should be deleted once the PI
+// layer is completely ported to UR and deleted.
 class UrReturnHelper {
 public:
   UrReturnHelper(size_t param_value_size, void *param_value,

From d9effb3148c20d7b1f34978240190041ee94b4d8 Mon Sep 17 00:00:00 2001
From: Omar Ahmed
Date: Thu, 18 May 2023 14:57:11 +0100
Subject: [PATCH 02/42] [SYCL][HIP][UR] Add setErrorMessage mechanism and
 remove ur_object base class

---
 .../unified_runtime/ur/adapters/hip/common.cpp   | 12 ++++++++++++
 .../unified_runtime/ur/adapters/hip/common.hpp   |  8 ++++++++
 .../unified_runtime/ur/adapters/hip/context.hpp  |  2 +-
 .../unified_runtime/ur/adapters/hip/device.hpp   |  2 +-
 .../unified_runtime/ur/adapters/hip/platform.hpp |  2 +-
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp
index 28777e9e9d085..e2c4f967b24a6 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp
@@ -82,3 +82,15 @@ void sycl::detail::ur::assertion(bool Condition, const char *Message) {
 void sycl::detail::ur::hipPrint(const char *Message) {
   std::cerr << "ur_print: " << Message << std::endl;
 }
+
+// Global variables for the adapter-specific error-message mechanism
+thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS;
+thread_local char ErrorMessage[MaxMessageSize];
+
+// Utility function for setting a message and warning
+[[maybe_unused]] void setErrorMessage(const char *message,
+                                      ur_result_t error_code) {
+  assert(strlen(message) < MaxMessageSize);
+  strcpy(ErrorMessage, message);
+  ErrorMessageCode = error_code;
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp
index 2d32ebf0abd34..b7a1da0e48784 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp
@@ -76,6 +76,14 @@ ur_result_t check_error_ur(hipError_t result, const char *function, int line,
 
 std::string
getHipVersionString(); +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code); + /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp index da634c3dbe474..3037f2943dc8a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp @@ -17,7 +17,7 @@ using pi_context = _pi_context *; typedef void (*ur_context_extended_deleter_t)(void *user_data); -struct ur_context_handle_t_ : public _ur_object { +struct ur_context_handle_t_ { struct deleter_data { ur_context_extended_deleter_t function; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp index 2c434c2697319..578e003223d4c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp @@ -16,7 +16,7 @@ /// and implements the reference counting semantics since /// HIP objects are not refcounted. 
/// -struct ur_device_handle_t_ : public _ur_object { +struct ur_device_handle_t_ { private: using native_type = hipDevice_t; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp index 8337e52095bb8..fb89d5bea24ea 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp @@ -12,7 +12,7 @@ #include -struct ur_platform_handle_t_ : public _ur_platform { +struct ur_platform_handle_t_ { static hipEvent_t evBase_; // HIP event used as base counter std::vector> devices_; }; From 0e3cfa90718b3d5fc1afd00d82c8c26055e20b7d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 18 May 2023 16:20:54 +0100 Subject: [PATCH 03/42] [SYCL][HIP][UR] Add MAX_REGISTERS_PER_WORK_GROUP device query --- .../unified_runtime/ur/adapters/hip/device.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 0f57a2c8d0096..a537fe58328d5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -810,7 +810,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, #endif return UR_RESULT_ERROR_INVALID_VALUE; } + case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + // Maximum number of 32-bit registers available to a thread block. + // Note: This number is shared by all thread blocks simultaneously resident + // on a multiprocessor. 
+    int max_registers{-1};
+    UR_CHECK_ERROR(hipDeviceGetAttribute(
+        &max_registers, hipDeviceAttributeMaxRegistersPerBlock, device->get()));
+    sycl::detail::ur::assertion(max_registers >= 0);
+
+    return ReturnValue(static_cast<uint32_t>(max_registers));
+  }
   case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
     return ReturnValue(false);
   case UR_DEVICE_INFO_IMAGE_SRGB:

From 989014dbe3a1ef2a6572b0ce013cdf793f5b2ac8 Mon Sep 17 00:00:00 2001
From: Petr Vesely
Date: Fri, 12 May 2023 15:10:45 +0100
Subject: [PATCH 04/42] [SYCL][HIP][UR] Port memory entry points

---
 sycl/plugins/hip/CMakeLists.txt             |   2 +
 sycl/plugins/hip/pi_hip.cpp                 | 629 ++----------------
 sycl/plugins/hip/pi_hip.hpp                 | 181 +----
 sycl/plugins/unified_runtime/CMakeLists.txt |   2 +
 .../ur/adapters/hip/common.hpp              |  77 ++-
 .../ur/adapters/hip/memory.cpp              | 530 +++++++++++++++
 .../ur/adapters/hip/memory.hpp              | 201 ++++++
 .../ur/adapters/hip/ur_interface_loader.cpp |  21 +-
 8 files changed, 875 insertions(+), 768 deletions(-)
 create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp
 create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp

diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt
index dca1167124811..aa8500ed0a92d 100644
--- a/sycl/plugins/hip/CMakeLists.txt
+++ b/sycl/plugins/hip/CMakeLists.txt
@@ -100,6 +100,8 @@ add_sycl_plugin(hip
     "../unified_runtime/ur/adapters/hip/device.hpp"
     "../unified_runtime/ur/adapters/hip/platform.cpp"
     "../unified_runtime/ur/adapters/hip/platform.hpp"
+    "../unified_runtime/ur/adapters/hip/memory.cpp"
+    "../unified_runtime/ur/adapters/hip/memory.hpp"
     "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp"
     "${sycl_inc_dir}/sycl/detail/pi.h"
     "${sycl_inc_dir}/sycl/detail/pi.hpp"
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index fdf335c923c99..b29336d19a924 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -455,9 +455,9 @@ hipStream_t _pi_queue::get_next_transfer_stream() {
_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, hipStream_t stream, pi_uint32 stream_token) : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false}, - isRecorded_{false}, isStarted_{false}, streamToken_{stream_token}, - evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr}, queue_{queue}, - stream_{stream}, context_{context} { + isRecorded_{false}, isStarted_{false}, + streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { assert(type != PI_COMMAND_TYPE_USER); @@ -687,81 +687,6 @@ std::string getKernelNames(pi_program program) { return {}; } -/// RAII object that calls the reference count release function on the held PI -/// object on destruction. -/// -/// The `dismiss` function stops the release from happening on destruction. -template class ReleaseGuard { -private: - T Captive; - - static pi_result callRelease(pi_device Captive) { - return pi2ur::piDeviceRelease(Captive); - } - - static pi_result callRelease(pi_context Captive) { - return pi2ur::piContextRelease(Captive); - } - - static pi_result callRelease(pi_mem Captive) { - return hip_piMemRelease(Captive); - } - - static pi_result callRelease(pi_program Captive) { - return hip_piProgramRelease(Captive); - } - - static pi_result callRelease(pi_kernel Captive) { - return hip_piKernelRelease(Captive); - } - - static pi_result callRelease(pi_queue Captive) { - return hip_piQueueRelease(Captive); - } - - static pi_result callRelease(pi_event Captive) { - return hip_piEventRelease(Captive); - } - -public: - ReleaseGuard() = delete; - /// Obj can be `nullptr`. - explicit ReleaseGuard(T Obj) : Captive(Obj) {} - ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) { - Other.Captive = nullptr; - } - - ReleaseGuard(const ReleaseGuard &) = delete; - - /// Calls the related PI object release function if the object held is not - /// `nullptr` or if `dismiss` has not been called. 
- ~ReleaseGuard() { - if (Captive != nullptr) { - pi_result ret = callRelease(Captive); - if (ret != PI_SUCCESS) { - // A reported HIP error is either an implementation or an asynchronous - // HIP error for which it is unclear if the function that reported it - // succeeded or not. Either way, the state of the program is compromised - // and likely unrecoverable. - sycl::detail::pi::die( - "Unrecoverable program state reached in hip_piMemRelease"); - } - } - } - - ReleaseGuard &operator=(const ReleaseGuard &) = delete; - - ReleaseGuard &operator=(ReleaseGuard &&Other) { - Captive = Other.Captive; - Other.Captive = nullptr; - return *this; - } - - /// End the guard and do not release the reference count of the held - /// PI object. - void dismiss() { Captive = nullptr; } -}; - //-- PI API implementation extern "C" { @@ -823,319 +748,6 @@ pi_result hip_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Creates a PI Memory object using a HIP memory allocation. -/// Can trigger a manual copy depending on the mode. -/// \TODO Implement USE_HOST_PTR using cuHostRegister -/// -pi_result -hip_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, - [[maybe_unused]] const pi_mem_properties *properties) { - // Need input memory object - assert(ret_mem != nullptr); - assert((properties == nullptr || *properties == 0) && - "no mem properties goes to HIP RT yet"); - // Currently, USE_HOST_PTR is not implemented using host register - // since this triggers a weird segfault after program ends. - // Setting this constant to true enables testing that behavior. 
- const bool enableUseHostPtr = false; - const bool performInitialCopy = - (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr); - pi_result retErr = PI_SUCCESS; - pi_mem retMemObj = nullptr; - - try { - ScopedContext active(context); - void *ptr; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) { - retErr = PI_CHECK_ERROR( - hipHostRegister(host_ptr, size, hipHostRegisterMapped)); - retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr; - } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - retErr = PI_CHECK_ERROR(hipHostMalloc(&host_ptr, size)); - retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - } else { - retErr = PI_CHECK_ERROR(hipMalloc(&ptr, size)); - if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in; - } - } - - if (retErr == PI_SUCCESS) { - pi_mem parentBuffer = nullptr; - - auto devPtr = - reinterpret_cast<_pi_mem::mem_::mem_::buffer_mem_::native_type>(ptr); - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, parentBuffer, allocMode, devPtr, host_ptr, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { - // Operates on the default stream of the current HIP context. - retErr = PI_CHECK_ERROR(hipMemcpyHtoD(devPtr, host_ptr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other PI call - // uses it. 
- if (retErr == PI_SUCCESS) { - hipStream_t defaultStream = 0; - retErr = PI_CHECK_ERROR(hipStreamSynchronize(defaultStream)); - } - } - } else { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - } - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_RESOURCES; - } - - *ret_mem = retMemObj; - - return retErr; -} - -/// Decreases the reference count of the Mem object. -/// If this is zero, calls the relevant HIP Free function -/// \return PI_SUCCESS unless deallocation error -/// -pi_result hip_piMemRelease(pi_mem memObj) { - assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS"); - - pi_result ret = PI_SUCCESS; - - try { - - // Do nothing if there are other references - if (memObj->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - // make sure memObj is released in case PI_CHECK_ERROR throws - std::unique_ptr<_pi_mem> uniqueMemObj(memObj); - - if (memObj->is_sub_buffer()) { - return PI_SUCCESS; - } - - ScopedContext active(uniqueMemObj->get_context()); - - if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: - case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: - ret = PI_CHECK_ERROR( - hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = PI_CHECK_ERROR( - hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = PI_CHECK_ERROR( - hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - }; - } - - else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { - ret = PI_CHECK_ERROR(hipDestroySurfaceObject( - uniqueMemObj->mem_.surface_mem_.get_surface())); - auto array = uniqueMemObj->mem_.surface_mem_.get_array(); - ret = PI_CHECK_ERROR(hipFreeArray(array)); - } - - } catch (pi_result err) { - ret = err; - } catch (...) 
{ - ret = PI_ERROR_OUT_OF_RESOURCES; - } - - if (ret != PI_SUCCESS) { - // A reported HIP error is either an implementation or an asynchronous HIP - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. - sycl::detail::pi::die( - "Unrecoverable program state reached in hip_piMemRelease"); - } - - return PI_SUCCESS; -} - -/// Implements a buffer partition in the HIP backend. -/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented -/// as an offset over an existing HIP allocation. -/// -pi_result hip_piMemBufferPartition( - pi_mem parent_buffer, pi_mem_flags flags, - [[maybe_unused]] pi_buffer_create_type buffer_create_type, - void *buffer_create_info, pi_mem *memObj) { - assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT"); - assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS"); - assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT"); - - // Default value for flags means PI_MEM_FLAGS_ACCCESS_RW. - if (flags == 0) { - flags = PI_MEM_FLAGS_ACCESS_RW; - } - - assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) && - "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE"); - assert(memObj != nullptr); - - const auto bufferRegion = - *reinterpret_cast(buffer_create_info); - assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE"); - - assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) && - "Overflow"); - assert(((bufferRegion.origin + bufferRegion.size) <= - parent_buffer->mem_.buffer_mem_.get_size()) && - "PI_ERROR_INVALID_BUFFER_SIZE"); - // Retained indirectly due to retaining parent buffer below. 
- pi_context context = parent_buffer->context_; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - assert(parent_buffer->mem_.buffer_mem_.ptr_ != - _pi_mem::mem_::buffer_mem_::native_type{0}); - _pi_mem::mem_::buffer_mem_::native_type ptr = - parent_buffer->mem_.buffer_mem_.get_with_offset(bufferRegion.origin); - - void *hostPtr = nullptr; - if (parent_buffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(parent_buffer->mem_.buffer_mem_.hostPtr_) + - bufferRegion.origin; - } - - ReleaseGuard releaseGuard(parent_buffer); - - std::unique_ptr<_pi_mem> retMemObj{nullptr}; - try { - ScopedContext active(context); - - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; - } catch (pi_result err) { - *memObj = nullptr; - return err; - } catch (...) { - *memObj = nullptr; - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - releaseGuard.dismiss(); - *memObj = retMemObj.release(); - return PI_SUCCESS; -} - -pi_result hip_piMemGetInfo(pi_mem memObj, pi_mem_info queriedInfo, - size_t expectedQuerySize, void *queryOutput, - size_t *writtenQuerySize) { - (void)memObj; - (void)queriedInfo; - (void)expectedQuerySize; - (void)queryOutput; - (void)writtenQuerySize; - - sycl::detail::pi::die("hip_piMemGetInfo not implemented"); -} - -/// Gets the native HIP handle of a PI mem object -/// -/// \param[in] mem The PI mem to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI mem object. -/// -/// \return PI_SUCCESS -pi_result hip_piextMemGetNativeHandle(pi_mem mem, - pi_native_handle *nativeHandle) { -#if defined(__HIP_PLATFORM_NVIDIA__) - if (sizeof(_pi_mem::mem_::buffer_mem_::native_type) > - sizeof(pi_native_handle)) { - // Check that all the upper bits that cannot be represented by - // pi_native_handle are empty. 
- // NOTE: The following shift might trigger a warning, but the check in the - // if above makes sure that this does not underflow. - _pi_mem::mem_::buffer_mem_::native_type upperBits = - mem->mem_.buffer_mem_.get() >> (sizeof(pi_native_handle) * CHAR_BIT); - if (upperBits) { - // Return an error if any of the remaining bits is non-zero. - return PI_ERROR_INVALID_MEM_OBJECT; - } - } - *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); -#elif defined(__HIP_PLATFORM_AMD__) - *nativeHandle = - reinterpret_cast(mem->mem_.buffer_mem_.get()); -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif - return PI_SUCCESS; -} - -/// Created a PI mem object from a HIP mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI mem object from. -/// \param[in] context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Indicates if we own the native memory handle or -/// it came from interop that asked to not transfer the ownership to SYCL RT. -/// \param[out] mem Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result hip_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_mem *mem) { - (void)nativeHandle; - (void)context; - (void)ownNativeHandle; - (void)mem; - - sycl::detail::pi::die( - "Creation of PI mem from native handle not implemented"); - return {}; -} - -/// Created a PI image mem object from a HIP image mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI mem object from. -/// \param[in] context The PI context of the memory allocation. 
-/// \param[in] ownNativeHandle Indicates if we own the native memory handle or -/// it came from interop that asked to not transfer the ownership to SYCL RT. -/// \param[in] ImageFormat The format of the image. -/// \param[in] ImageDesc The description information for the image. -/// \param[out] mem Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result hip_piextMemImageCreateWithNativeHandle( - pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, - const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, - pi_mem *mem) { - (void)nativeHandle; - (void)context; - (void)ownNativeHandle; - (void)ImageFormat; - (void)ImageDesc; - (void)mem; - - sycl::detail::pi::die( - "Creation of PI mem from native image handle not implemented"); - return {}; -} - /// Creates a `pi_queue` object on the HIP backend. /// Valid properties /// * __SYCL_PI_HIP_USE_DEFAULT_STREAM -> hipStreamDefault @@ -1766,182 +1378,6 @@ hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, return {}; } -/// \TODO Not implemented - -pi_result hip_piMemImageCreate(pi_context context, pi_mem_flags flags, - const pi_image_format *image_format, - const pi_image_desc *image_desc, void *host_ptr, - pi_mem *ret_mem) { - - // Need input memory object - assert(ret_mem != nullptr); - const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE)); - pi_result retErr = PI_SUCCESS; - - // We only support RBGA channel order - // TODO: check SYCL CTS and spec. May also have to support BGRA - if (image_format->image_channel_order != - pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) { - sycl::detail::pi::die( - "hip_piMemImageCreate only supports RGBA channel order"); - } - - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. 
image_desc gives - // a minimum value of 1, so we need to convert the answer. - HIP_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = image_desc->image_width; - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = image_desc->image_depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; - - switch (image_format->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = HIP_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = HIP_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; - break; - default: - sycl::detail::pi::die( - 
"hip_piMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used image_desc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * - image_desc->image_height * image_desc->image_depth; - - ScopedContext active(context); - hipArray *image_array; - retErr = PI_CHECK_ERROR(hipArray3DCreate( - reinterpret_cast(&image_array), &array_desc)); - - try { - if (performInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - hipMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - hip_Memcpy2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = reinterpret_cast(image_array); - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - HIP_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = reinterpret_cast(image_array); - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - cpy_desc.Depth = image_desc->image_depth; - retErr = PI_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc)); - } - } - - // HIP_RESOURCE_DESC is a union of different structs, shown here - // We need to fill it as described here to use it for a surface or texture - // 
HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and - // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array - // handle. - // HIP_RESOURCE_DESC::flags must be set to zero - - hipResourceDesc image_res_desc; - image_res_desc.res.array.array = image_array; - image_res_desc.resType = hipResourceTypeArray; - - hipSurfaceObject_t surface; - retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc)); - - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); - - if (piMemObj == nullptr) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *ret_mem = piMemObj.release(); - } catch (pi_result err) { - PI_CHECK_ERROR(hipFreeArray(image_array)); - return err; - } catch (...) { - PI_CHECK_ERROR(hipFreeArray(image_array)); - return PI_ERROR_UNKNOWN; - } - return retErr; -} - -/// \TODO Not implemented -pi_result hip_piMemImageGetInfo(pi_mem image, pi_image_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - (void)image; - (void)param_name; - (void)param_value_size; - (void)param_value; - (void)param_value_size_ret; - - sycl::detail::pi::die("hip_piMemImageGetInfo not implemented"); - return {}; -} - -pi_result hip_piMemRetain(pi_mem mem) { - assert(mem != nullptr); - assert(mem->get_reference_count() > 0); - mem->increment_reference_count(); - return PI_SUCCESS; -} - /// Not used as HIP backend only creates programs from binary. /// See \ref hip_piclProgramCreateWithBinary. 
/// @@ -3336,6 +2772,17 @@ static pi_result commonEnqueueMemImageNDCopy( return PI_ERROR_INVALID_VALUE; } +// TODO(ur) - this is just a workaround until we port Enqueue +static std::unordered_map UrToPiMemTypeMap = { + {UR_MEM_TYPE_BUFFER, PI_MEM_TYPE_BUFFER}, + {UR_MEM_TYPE_IMAGE2D, PI_MEM_TYPE_IMAGE2D}, + {UR_MEM_TYPE_IMAGE3D, PI_MEM_TYPE_IMAGE3D}, + {UR_MEM_TYPE_IMAGE2D_ARRAY, PI_MEM_TYPE_IMAGE2D_ARRAY}, + {UR_MEM_TYPE_IMAGE1D, PI_MEM_TYPE_IMAGE1D}, + {UR_MEM_TYPE_IMAGE1D_ARRAY, PI_MEM_TYPE_IMAGE1D_ARRAY}, + {UR_MEM_TYPE_IMAGE1D_BUFFER, PI_MEM_TYPE_IMAGE1D_BUFFER}, +}; + pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_bool blocking_read, const size_t *origin, const size_t *region, size_t row_pitch, @@ -3372,7 +2819,15 @@ pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, size_t byteOffsetX = origin[0] * elementByteSize * NumChannels; size_t bytesToCopy = elementByteSize * NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + // TODO(ur) - this can be removed when porting Enqueue + auto urImgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType; + if (auto search = UrToPiMemTypeMap.find(urImgType); + search != UrToPiMemTypeMap.end()) { + imgType = search->second; + } else { + return PI_ERROR_UNKNOWN; + } size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; @@ -3441,7 +2896,15 @@ pi_result hip_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t byteOffsetX = origin[0] * elementByteSize * NumChannels; size_t bytesToCopy = elementByteSize * NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + // TODO(ur) - this can be removed when porting Enqueue + auto urImgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType; + if (auto search = UrToPiMemTypeMap.find(urImgType); + search != UrToPiMemTypeMap.end()) { + imgType = 
search->second; + } else { + return PI_ERROR_UNKNOWN; + } size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; @@ -3513,7 +2976,15 @@ pi_result hip_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels; size_t bytesToCopy = elementByteSize * srcNumChannels * region[0]; - pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); + // TODO(ur) - this can be removed when porting Enqueue + auto urImgType = src_image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType; + if (auto search = UrToPiMemTypeMap.find(urImgType); + search != UrToPiMemTypeMap.end()) { + imgType = search->second; + } else { + return PI_ERROR_UNKNOWN; + } size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; @@ -4247,15 +3718,15 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextQueueGetNativeHandle, hip_piextQueueGetNativeHandle) _PI_CL(piextQueueCreateWithNativeHandle, hip_piextQueueCreateWithNativeHandle) // Memory - _PI_CL(piMemBufferCreate, hip_piMemBufferCreate) - _PI_CL(piMemImageCreate, hip_piMemImageCreate) - _PI_CL(piMemGetInfo, hip_piMemGetInfo) - _PI_CL(piMemImageGetInfo, hip_piMemImageGetInfo) - _PI_CL(piMemRetain, hip_piMemRetain) - _PI_CL(piMemRelease, hip_piMemRelease) - _PI_CL(piMemBufferPartition, hip_piMemBufferPartition) - _PI_CL(piextMemGetNativeHandle, hip_piextMemGetNativeHandle) - _PI_CL(piextMemCreateWithNativeHandle, hip_piextMemCreateWithNativeHandle) + _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) + _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) + _PI_CL(piMemGetInfo, pi2ur::piMemGetInfo) + _PI_CL(piMemImageGetInfo, pi2ur::piMemImageGetInfo) + _PI_CL(piMemRetain, pi2ur::piMemRetain) + _PI_CL(piMemRelease, pi2ur::piMemRelease) + _PI_CL(piMemBufferPartition, pi2ur::piMemBufferPartition) + 
_PI_CL(piextMemGetNativeHandle, pi2ur::piextMemGetNativeHandle) + _PI_CL(piextMemCreateWithNativeHandle, pi2ur::piextMemCreateWithNativeHandle) // Program _PI_CL(piProgramCreate, hip_piProgramCreate) _PI_CL(piclProgramCreateWithSource, hip_piclProgramCreateWithSource) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 9281228c3c85e..ce184341a8405 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -41,6 +41,7 @@ #include #include +#include #include #include "pi2ur.hpp" @@ -52,8 +53,6 @@ pi_result hip_piProgramRetain(pi_program); pi_result hip_piProgramRelease(pi_program); pi_result hip_piQueueRelease(pi_queue); pi_result hip_piQueueRetain(pi_queue); -pi_result hip_piMemRetain(pi_mem); -pi_result hip_piMemRelease(pi_mem); pi_result hip_piKernelRetain(pi_kernel); pi_result hip_piKernelRelease(pi_kernel); /// \endcond @@ -125,182 +124,8 @@ struct _pi_context : ur_context_handle_t_ { /// \brief Represents non-SVM allocations on the HIP backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. -struct _pi_mem { - - // TODO: Move as much shared data up as possible - using pi_context = _pi_context *; - - // Context where the memory object is accessibles - pi_context context_; - - /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; - - /// A PI Memory object represents either plain memory allocations ("Buffers" - /// in OpenCL) or typed allocations ("Images" in OpenCL). - /// In HIP their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces - /// This union allows implementation to use either from the same handler. 
- union mem_ { - // Handler for plain, pointer-based HIP allocations - struct buffer_mem_ { - using native_type = hipDeviceptr_t; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - pi_mem parent_; - // HIP handler for the pointer - native_type ptr_; - - /// Pointer associated with this device on the host - void *hostPtr_; - /// Size of the allocation in bytes - size_t size_; - /// Offset of the active mapped region. - size_t mapOffset_; - /// Pointer to the active mapped region, if any - void *mapPtr_; - /// Original flags for the mapped region - pi_map_flags mapFlags_; - - /** alloc_mode - * classic: Just a normal buffer allocated on the device via hip malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; - - native_type get() const noexcept { return ptr_; } - - native_type get_with_offset(size_t offset) const noexcept { - return reinterpret_cast(reinterpret_cast(ptr_) + - offset); - } - - void *get_void() const noexcept { return reinterpret_cast(ptr_); } - - size_t get_size() const noexcept { return size_; } - - void *get_map_ptr() const noexcept { return mapPtr_; } - - size_t get_map_offset(void *ptr) const noexcept { - (void)ptr; - return mapOffset_; - } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the HIP allocation. 
- /// - void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; - } else { - // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); - } - return mapPtr_; - } - - /// Detach the allocation from the host memory. - void unmap(void *ptr) noexcept { - (void)ptr; - assert(mapPtr_ != nullptr); - - if (mapPtr_ != hostPtr_) { - free(mapPtr_); - } - mapPtr_ = nullptr; - mapOffset_ = 0; - } - - pi_map_flags get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; - } - } buffer_mem_; - - // Handler data for surface object (i.e. Images) - struct surface_mem_ { - hipArray *array_; - hipSurfaceObject_t surfObj_; - pi_mem_type imageType_; - - hipArray *get_array() const noexcept { return array_; } - - hipSurfaceObject_t get_surface() const noexcept { return surfObj_; } - - pi_mem_type get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; - - /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - hipDeviceptr_t ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = PI_MAP_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - hip_piMemRetain(mem_.buffer_mem_.parent_); - } else { - pi2ur::piContextRetain(context_); - } - }; - - /// Constructs the PI allocation for an Image object - _pi_mem(pi_context ctxt, hipArray *array, hipSurfaceObject_t surf, - pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, 
mem_type_{mem_type::surface} { - (void)host_ptr; - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.imageType_ = image_type; - mem_.surface_mem_.surfObj_ = surf; - pi2ur::piContextRetain(context_); - } - - ~_pi_mem() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - hip_piMemRelease(mem_.buffer_mem_.parent_); - return; - } - } - pi2ur::piContextRelease(context_); - } - - // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } - - bool is_sub_buffer() const noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); - } - - bool is_image() const noexcept { return mem_type_ == mem_type::surface; } - - pi_context get_context() const noexcept { return context_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_mem : ur_mem_handle_t_ { + using ur_mem_handle_t_::ur_mem_handle_t_; }; /// PI queue mapping on to hipStream_t objects. 
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 5e47ee42c2cda..fb6856a100735 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -185,6 +185,8 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/device.hpp" "ur/adapters/hip/platform.cpp" "ur/adapters/hip/platform.hpp" + "ur/adapters/hip/memory.cpp" + "ur/adapters/hip/memory.hpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp index b7a1da0e48784..010b40d6b46a5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp @@ -104,4 +104,79 @@ void assertion(bool Condition, const char *Message = nullptr); } // namespace ur } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl \ No newline at end of file +} // namespace sycl + +/// RAII object that calls the reference count release function on the held UR +/// object on destruction. +/// +/// The `dismiss` function stops the release from happening on destruction. 
+template <typename T> class ReleaseGuard {
+private:
+  T Captive;
+
+  static ur_result_t callRelease(ur_device_handle_t Captive) {
+    return urDeviceRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_context_handle_t Captive) {
+    return urContextRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_mem_handle_t Captive) {
+    return urMemRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_program_handle_t Captive) {
+    return urProgramRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_kernel_handle_t Captive) {
+    return urKernelRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_queue_handle_t Captive) {
+    return urQueueRelease(Captive);
+  }
+
+  static ur_result_t callRelease(ur_event_handle_t Captive) {
+    return urEventRelease(Captive);
+  }
+
+public:
+  ReleaseGuard() = delete;
+  /// Obj can be `nullptr`.
+  explicit ReleaseGuard(T Obj) : Captive(Obj) {}
+  ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) {
+    Other.Captive = nullptr;
+  }
+
+  ReleaseGuard(const ReleaseGuard &) = delete;
+
+  /// Calls the related UR object release function if the object held is not
+  /// `nullptr` or if `dismiss` has not been called.
+  ~ReleaseGuard() {
+    if (Captive != nullptr) {
+      ur_result_t ret = callRelease(Captive);
+      if (ret != UR_RESULT_SUCCESS) {
+        // A reported HIP error is either an implementation or an asynchronous
+        // HIP error for which it is unclear if the function that reported it
+        // succeeded or not. Either way, the state of the program is compromised
+        // and likely unrecoverable.
+        sycl::detail::ur::die(
+            "Unrecoverable program state reached in ReleaseGuard destruction");
+      }
+    }
+  }
+
+  ReleaseGuard &operator=(const ReleaseGuard &) = delete;
+
+  ReleaseGuard &operator=(ReleaseGuard &&Other) {
+    Captive = Other.Captive;
+    Other.Captive = nullptr;
+    return *this;
+  }
+
+  /// End the guard and do not release the reference count of the held
+  /// UR object.
+ void dismiss() { Captive = nullptr; } +}; \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp new file mode 100644 index 0000000000000..24dc708d3b449 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -0,0 +1,530 @@ +#include "memory.hpp" +#include "context.hpp" +#include + +/// Decreases the reference count of the Mem object. +/// If this is zero, calls the relevant HIP Free function +/// \return UR_RESULT_SUCCESS unless deallocation error +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t ret = UR_RESULT_SUCCESS; + + try { + + // Do nothing if there are other references + if (hMem->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + // make sure memObj is released in case UR_CHECK_ERROR throws + std::unique_ptr uniqueMemObj(hMem); + + if (hMem->is_sub_buffer()) { + return UR_RESULT_SUCCESS; + } + + ScopedContext active(uniqueMemObj->get_context()); + + if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { + switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: + ret = UR_CHECK_ERROR( + hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: + ret = UR_CHECK_ERROR( + hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: + ret = UR_CHECK_ERROR( + hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + }; + } + + else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + ret = UR_CHECK_ERROR(hipDestroySurfaceObject( + uniqueMemObj->mem_.surface_mem_.get_surface())); + auto array = 
uniqueMemObj->mem_.surface_mem_.get_array(); + ret = UR_CHECK_ERROR(hipFreeArray(array)); + } + + } catch (ur_result_t err) { + ret = err; + } catch (...) { + ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + if (ret != UR_RESULT_SUCCESS) { + // A reported HIP error is either an implementation or an asynchronous HIP + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. + sycl::detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + + return UR_RESULT_SUCCESS; +} + +/// Creates a UR Memory object using a HIP memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using hipHostRegister +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Validate flags + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & + (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + UR_ASSERT(pProperties && pProperties->pHost, + UR_RESULT_ERROR_INVALID_HOST_PTR); + } + // Need input memory object + UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. 
+ const bool enableUseHostPtr = false; + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); + ur_result_t retErr = UR_RESULT_SUCCESS; + ur_mem_handle_t retMemObj = nullptr; + + try { + ScopedContext active(hContext); + void *ptr; + auto pHost = pProperties ? pProperties->pHost : nullptr; + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { + retErr = + UR_CHECK_ERROR(hipHostRegister(pHost, size, hipHostRegisterMapped)); + retErr = UR_CHECK_ERROR(hipHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + retErr = UR_CHECK_ERROR(hipHostMalloc(&pHost, size)); + retErr = UR_CHECK_ERROR(hipHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + } else { + retErr = UR_CHECK_ERROR(hipMalloc(&ptr, size)); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + } + } + + if (retErr == UR_RESULT_SUCCESS) { + ur_mem_handle_t parentBuffer = nullptr; + + auto devPtr = reinterpret_cast< + ur_mem_handle_t_::mem_::mem_::buffer_mem_::native_type>(ptr); + auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, allocMode, devPtr, pHost, size}); + if (urMemObj != nullptr) { + retMemObj = urMemObj.release(); + if (performInitialCopy) { + // Operates on the default stream of the current HIP context. + retErr = UR_CHECK_ERROR(hipMemcpyHtoD(devPtr, pHost, size)); + // Synchronize with default stream implicitly used by hipMemcpyHtoD + // to make buffer data available on device before any other UR call + // uses it. 
+ if (retErr == UR_RESULT_SUCCESS) { + hipStream_t defaultStream = 0; + retErr = UR_CHECK_ERROR(hipStreamSynchronize(defaultStream)); + } + } + } else { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + } + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + *phBuffer = retMemObj; + + return retErr; +} + +/// Implements a buffer partition in the HIP backend. +/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented +/// as an offset over an existing HIP allocation. +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
+ if (flags == 0) { + flags = UR_MEM_FLAG_READ_WRITE; + } + + UR_ASSERT(!(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), + UR_RESULT_ERROR_INVALID_VALUE); + if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + + UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + UR_ASSERT(((pRegion->origin + pRegion->size) <= + hBuffer->mem_.buffer_mem_.get_size()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + // Retained indirectly due to retaining parent buffer below. + ur_context_handle_t context = hBuffer->context_; + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + UR_ASSERT(hBuffer->mem_.buffer_mem_.ptr_ != + ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = + hBuffer->mem_.buffer_mem_.get_with_offset(pRegion->origin); + + void *hostPtr = nullptr; + if (hBuffer->mem_.buffer_mem_.hostPtr_) { + hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + + pRegion->origin; + } + + ReleaseGuard releaseGuard(hBuffer); + + std::unique_ptr retMemObj{nullptr}; + try { + ScopedContext active(context); + + retMemObj = std::unique_ptr{new ur_mem_handle_t_{ + context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; + } catch (ur_result_t err) { + *phMem = nullptr; + return err; + } catch (...) 
{ + *phMem = nullptr; + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + releaseGuard.dismiss(); + *phMem = retMemObj.release(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + + UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + ScopedContext active(hMemory->get_context()); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + try { + size_t allocSize = 0; + UR_CHECK_ERROR(hipMemGetAddressRange(nullptr, &allocSize, + hMemory->mem_.buffer_mem_.ptr_)); + return ReturnValue(allocSize); + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->get_context()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +/// Gets the native HIP handle of a UR mem object +/// +/// \param[in] mem The UR mem to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the UR mem object. +/// +/// \return UR_RESULT_SUCCESS +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { +#if defined(__HIP_PLATFORM_NVIDIA__) + if (sizeof(ur_mem_handle_t_::mem_::buffer_mem_::native_type) > + sizeof(ur_native_handle_t)) { + // Check that all the upper bits that cannot be represented by + // ur_native_handle_t are empty. + // NOTE: The following shift might trigger a warning, but the check in the + // if above makes sure that this does not underflow. 
+ ur_mem_handle_t_::mem_::buffer_mem_::native_type upperBits = + hMem->mem_.buffer_mem_.get() >> (sizeof(ur_native_handle_t) * CHAR_BIT); + if (upperBits) { + // Return an error if any of the remaining bits is non-zero. + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; + } + } + *phNativeMem = static_cast(hMem->mem_.buffer_mem_.get()); +#elif defined(__HIP_PLATFORM_AMD__) + *phNativeMem = + reinterpret_cast(hMem->mem_.buffer_mem_.get()); +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + std::ignore = hNativeMem; + std::ignore = hContext; + std::ignore = pProperties; + std::ignore = phMem; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + std::ignore = hNativeMem; + std::ignore = hContext; + std::ignore = pImageFormat; + std::ignore = pImageDesc; + std::ignore = pProperties; + std::ignore = phMem; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + void *pHost, ur_mem_handle_t *phMem) { + + // Need input memory object + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if 
(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); + } + + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); + + UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numMipLevel == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numSamples == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + if (!pHost) { + UR_ASSERT(pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->slicePitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + } + + ur_result_t retErr = UR_RESULT_SUCCESS; + + // We only support RBGA channel order + // TODO: check SYCL CTS and spec. May also have to support BGRA + UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + + // We have to use hipArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives + // a minimum value of 1, so we need to convert the answer. 
+ HIP_ARRAY3D_DESCRIPTOR array_desc; + array_desc.NumChannels = 4; // Only support 4 channel image + array_desc.Flags = 0; // No flags required + array_desc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + array_desc.Height = 0; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = pImageDesc->depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t pixel_type_size_bytes; + + switch (pImageFormat->channelType) { + + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + array_desc.Format = HIP_AD_FORMAT_HALF; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + array_desc.Format = HIP_AD_FORMAT_FLOAT; + pixel_type_size_bytes = 4; + break; + default: + // urMemImageCreate given unsupported image_channel_data_type + return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; + } + + // When a dimension isn't used 
image_desc has the size set to 1 + size_t pixel_size_bytes = + pixel_type_size_bytes * 4; // 4 is the only number of channels we support + size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; + + ScopedContext active(hContext); + hipArray *image_array; + retErr = UR_CHECK_ERROR(hipArray3DCreate( + reinterpret_cast(&image_array), &array_desc)); + + try { + if (performInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + hipMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + hip_Memcpy2D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + cpy_desc.dstArray = reinterpret_cast(image_array); + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + retErr = UR_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + HIP_MEMCPY3D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + cpy_desc.dstArray = reinterpret_cast(image_array); + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + cpy_desc.Depth = pImageDesc->depth; + retErr = UR_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc)); + } + } + + // HIP_RESOURCE_DESC is a union of different structs, shown here + // We need to fill it as described here to use it for a surface or texture + // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and + // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array + // handle. 
+    // HIP_RESOURCE_DESC::flags must be set to zero
+
+    hipResourceDesc image_res_desc;
+    image_res_desc.res.array.array = image_array;
+    image_res_desc.resType = hipResourceTypeArray;
+
+    hipSurfaceObject_t surface;
+    retErr = UR_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc));
+
+    auto urMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
+        hContext, image_array, surface, flags, pImageDesc->type, pHost});
+
+    if (urMemObj == nullptr) {
+      return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }
+
+    *phMem = urMemObj.release();
+  } catch (ur_result_t err) {
+    UR_CHECK_ERROR(hipFreeArray(image_array));
+    return err;
+  } catch (...) {
+    UR_CHECK_ERROR(hipFreeArray(image_array));
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return retErr;
+}
+
+/// \TODO Not implemented
+UR_APIEXPORT ur_result_t UR_APICALL
+urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType,
+                  size_t propSize, void *pImgInfo, size_t *pPropSizeRet) {
+  std::ignore = hMemory;
+  std::ignore = ImgInfoType;
+  std::ignore = propSize;
+  std::ignore = pImgInfo;
+  std::ignore = pPropSizeRet;
+
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
+  UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hMem->get_reference_count() > 0,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  hMem->increment_reference_count();
+  return UR_RESULT_SUCCESS;
+}
\ No newline at end of file
diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp
new file mode 100644
index 0000000000000..9403fc565dfe8
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp
@@ -0,0 +1,201 @@
+//===--------- memory.hpp - HIP Adapter -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" +#include + +/// UR Mem mapping to HIP memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the HIP backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. +struct ur_mem_handle_t_ { + + // TODO: Move as much shared data up as possible + using ur_context = ur_context_handle_t_ *; + using ur_mem = ur_mem_handle_t_ *; + + // Context where the memory object is accessibles + ur_context context_; + + /// Reference counting of the handler + std::atomic_uint32_t refCount_; + enum class mem_type { buffer, surface } mem_type_; + + // Original mem flags passed + ur_mem_flags_t memFlags_; + + /// A UR Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In HIP their API handlers are different. Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// This union allows implementation to use either from the same handler. + union mem_ { + // Handler for plain, pointer-based HIP allocations + struct buffer_mem_ { + using native_type = hipDeviceptr_t; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem parent_; + // HIP handler for the pointer + native_type ptr_; + + /// Pointer associated with this device on the host + void *hostPtr_; + /// Size of the allocation in bytes + size_t size_; + /// Offset of the active mapped region. 
+ size_t mapOffset_; + /// Pointer to the active mapped region, if any + void *mapPtr_; + /// Original flags for the mapped region + ur_map_flags_t mapFlags_; + + /** alloc_mode + * classic: Just a normal buffer allocated on the device via hip malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class alloc_mode { + classic, + use_host_ptr, + copy_in, + alloc_host_ptr + } allocMode_; + + native_type get() const noexcept { return ptr_; } + + native_type get_with_offset(size_t offset) const noexcept { + return reinterpret_cast(reinterpret_cast(ptr_) + + offset); + } + + void *get_void() const noexcept { return reinterpret_cast(ptr_); } + + size_t get_size() const noexcept { return size_; } + + void *get_map_ptr() const noexcept { return mapPtr_; } + + size_t get_map_offset(void *ptr) const noexcept { + (void)ptr; + return mapOffset_; + } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the HIP allocation. + /// + void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { + assert(mapPtr_ == nullptr); + mapOffset_ = offset; + mapFlags_ = flags; + if (hostPtr_) { + mapPtr_ = static_cast(hostPtr_) + offset; + } else { + // TODO: Allocate only what is needed based on the offset + mapPtr_ = static_cast(malloc(this->get_size())); + } + return mapPtr_; + } + + /// Detach the allocation from the host memory. + void unmap(void *ptr) noexcept { + (void)ptr; + assert(mapPtr_ != nullptr); + + if (mapPtr_ != hostPtr_) { + free(mapPtr_); + } + mapPtr_ = nullptr; + mapOffset_ = 0; + } + + ur_map_flags_t get_map_flags() const noexcept { + assert(mapPtr_ != nullptr); + return mapFlags_; + } + } buffer_mem_; + + // Handler data for surface object (i.e. 
Images) + struct surface_mem_ { + hipArray *array_; + hipSurfaceObject_t surfObj_; + ur_mem_type_t imageType_; + + hipArray *get_array() const noexcept { return array_; } + + hipSurfaceObject_t get_surface() const noexcept { return surfObj_; } + + ur_mem_type_t get_image_type() const noexcept { return imageType_; } + } surface_mem_; + } mem_; + + /// Constructs the UR MEM handler for a non-typed allocation ("buffer") + ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, + mem_::buffer_mem_::alloc_mode mode, hipDeviceptr_t ptr, + void *host_ptr, size_t size) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, + memFlags_{mem_flags} { + mem_.buffer_mem_.ptr_ = ptr; + mem_.buffer_mem_.parent_ = parent; + mem_.buffer_mem_.hostPtr_ = host_ptr; + mem_.buffer_mem_.size_ = size; + mem_.buffer_mem_.mapOffset_ = 0; + mem_.buffer_mem_.mapPtr_ = nullptr; + mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; + mem_.buffer_mem_.allocMode_ = mode; + if (is_sub_buffer()) { + urMemRetain(mem_.buffer_mem_.parent_); + } else { + urContextRetain(context_); + } + }; + + /// Constructs the UR allocation for an Image object + ur_mem_handle_t_(ur_context ctxt, hipArray *array, hipSurfaceObject_t surf, + ur_mem_flags_t mem_flags, ur_mem_type_t image_type, + void *host_ptr) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, + memFlags_{mem_flags} { + (void)host_ptr; + mem_.surface_mem_.array_ = array; + mem_.surface_mem_.imageType_ = image_type; + mem_.surface_mem_.surfObj_ = surf; + urContextRetain(context_); + } + + ~ur_mem_handle_t_() { + if (mem_type_ == mem_type::buffer) { + if (is_sub_buffer()) { + urMemRelease(mem_.buffer_mem_.parent_); + return; + } + } + urContextRelease(context_); + } + + // TODO: Move as many shared funcs up as possible + bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + + bool is_sub_buffer() const noexcept { + return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + } + + bool 
is_image() const noexcept { return mem_type_ == mem_type::surface; } + + ur_context get_context() const noexcept { return context_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 4add49dbf4fe1..96dcbf54eccb3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -144,16 +144,17 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBufferCreate = nullptr; - pDdiTable->pfnBufferPartition = nullptr; - pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; - pDdiTable->pfnImageCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnImageCreate = nullptr; - pDdiTable->pfnImageGetInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; return UR_RESULT_SUCCESS; } From 9e239f7fbd216419bd72fbec693a55135d6e2298 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 16 May 2023 12:42:35 +0100 Subject: [PATCH 05/42] 
[SYCL][PI][UR][HIP] Port usm and sampler entry-points to UR --- sycl/plugins/hip/CMakeLists.txt | 3 + sycl/plugins/hip/pi_hip.cpp | 354 +----------------- sycl/plugins/hip/pi_hip.hpp | 16 +- sycl/plugins/unified_runtime/CMakeLists.txt | 3 + .../ur/adapters/hip/memory.cpp | 3 +- .../ur/adapters/hip/sampler.cpp | 84 +++++ .../ur/adapters/hip/sampler.hpp | 31 ++ .../ur/adapters/hip/ur_interface_loader.cpp | 18 +- .../unified_runtime/ur/adapters/hip/usm.cpp | 231 ++++++++++++ 9 files changed, 375 insertions(+), 368 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index aa8500ed0a92d..2344ddcecff15 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -102,6 +102,9 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/platform.hpp" "../unified_runtime/ur/adapters/hip/memory.cpp" "../unified_runtime/ur/adapters/hip/memory.hpp" + "../unified_runtime/ur/adapters/hip/sampler.cpp" + "../unified_runtime/ur/adapters/hip/sampler.hpp" + "../unified_runtime/ur/adapters/hip/usm.cpp" "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index b29336d19a924..2b16e90c04472 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -2177,144 +2177,6 @@ pi_result hip_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, return {}; } -/// Creates a PI sampler object -/// -/// \param[in] context The context the sampler is created for. -/// \param[in] sampler_properties The properties for the sampler. -/// \param[out] result_sampler Set to the resulting sampler object. -/// -/// \return PI_SUCCESS on success. 
PI_ERROR_INVALID_VALUE if given an invalid -/// property -/// or if there is multiple of properties from the same category. -pi_result hip_piSamplerCreate(pi_context context, - const pi_sampler_properties *sampler_properties, - pi_sampler *result_sampler) { - std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; - - bool propSeen[3] = {false, false, false}; - for (size_t i = 0; sampler_properties[i] != 0; i += 2) { - switch (sampler_properties[i]) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: - if (propSeen[0]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[0] = true; - retImplSampl->props_ |= sampler_properties[i + 1]; - break; - case PI_SAMPLER_PROPERTIES_FILTER_MODE: - if (propSeen[1]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[1] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; - break; - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: - if (propSeen[2]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[2] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; - break; - default: - return PI_ERROR_INVALID_VALUE; - } - } - - if (!propSeen[0]) { - retImplSampl->props_ |= PI_TRUE; - } - // Default filter mode to CL_FILTER_NEAREST - if (!propSeen[2]) { - retImplSampl->props_ |= - (PI_SAMPLER_ADDRESSING_MODE_CLAMP % PI_SAMPLER_ADDRESSING_MODE_NONE) - << 2; - } - - *result_sampler = retImplSampl.release(); - return PI_SUCCESS; -} - -/// Gets information from a PI sampler object -/// -/// \param[in] sampler The sampler to get the information from. -/// \param[in] param_name The name of the information to get. -/// \param[in] param_value_size The size of the param_value. -/// \param[out] param_value Set to information value. -/// \param[out] param_value_size_ret Set to the size of the information value. -/// -/// \return PI_SUCCESS on success. 
-pi_result hip_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(sampler != nullptr); - - switch (param_name) { - case PI_SAMPLER_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->get_reference_count()); - case PI_SAMPLER_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->context_); - case PI_SAMPLER_INFO_NORMALIZED_COORDS: { - pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); - return getInfo(param_value_size, param_value, param_value_size_ret, - norm_coords_prop); - } - case PI_SAMPLER_INFO_FILTER_MODE: { - pi_sampler_filter_mode filter_prop = static_cast( - ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); - return getInfo(param_value_size, param_value, param_value_size_ret, - filter_prop); - } - case PI_SAMPLER_INFO_ADDRESSING_MODE: { - pi_sampler_addressing_mode addressing_prop = - static_cast( - (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); - return getInfo(param_value_size, param_value, param_value_size_ret, - addressing_prop); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - return {}; -} - -/// Retains a PI sampler object, incrementing its reference count. -/// -/// \param[in] sampler The sampler to increment the reference count of. -/// -/// \return PI_SUCCESS. -pi_result hip_piSamplerRetain(pi_sampler sampler) { - assert(sampler != nullptr); - sampler->increment_reference_count(); - return PI_SUCCESS; -} - -/// Releases a PI sampler object, decrementing its reference count. If the -/// reference count reaches zero, the sampler object is destroyed. -/// -/// \param[in] sampler The sampler to decrement the reference count of. -/// -/// \return PI_SUCCESS. 
-pi_result hip_piSamplerRelease(pi_sampler sampler) { - assert(sampler != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - sampler->get_reference_count() != 0, - "Reference count overflow detected in hip_piSamplerRelease."); - - // decrement ref count. If it is 0, delete the sampler. - if (sampler->decrement_reference_count() == 0) { - delete sampler; - } - - return PI_SUCCESS; -} - /// General 3D memory copy operation. /// This function requires the corresponding HIP context to be at the top of /// the context stack @@ -3152,106 +3014,6 @@ pi_result hip_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, return ret_err; } -/// USM: Implements USM Host allocations using HIP Pinned Memory -/// -pi_result -hip_piextUSMHostAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(hipHostMalloc(result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM device allocations using a normal HIP device pointer -/// -pi_result -hip_piextUSMDeviceAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(hipMalloc(result_ptr, size)); - 
} catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM Shared allocations using HIP Managed Memory -/// -pi_result -hip_piextUSMSharedAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = - PI_CHECK_ERROR(hipMallocManaged(result_ptr, size, hipMemAttachGlobal)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Frees the given USM pointer associated with the context. 
-/// -pi_result hip_piextUSMFree(pi_context context, void *ptr) { - - assert(context != nullptr); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - unsigned int type; - hipPointerAttribute_t hipPointerAttributeType; - result = - PI_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, ptr)); - type = hipPointerAttributeType.memoryType; - assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost); - if (type == hipMemoryTypeDevice) { - result = PI_CHECK_ERROR(hipFree(ptr)); - } - if (type == hipMemoryTypeHost) { - result = PI_CHECK_ERROR(hipFreeHost(ptr)); - } - } catch (pi_result error) { - result = error; - } - return result; -} - pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, @@ -3455,104 +3217,6 @@ pi_result hip_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, return result; } -/// API to query information about USM allocated pointers -/// Valid Queries: -/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value -/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if -/// the queried pointer fell inside an allocation. -/// Result must fit in void * -/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's -/// allocation is in bytes. Result is a size_t. 
-/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against -/// -/// \param context is the pi_context -/// \param ptr is the pointer to query -/// \param param_name is the type of query to perform -/// \param param_value_size is the size of the result in bytes -/// \param param_value is the result -/// \param param_value_ret is how many bytes were written -pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, - pi_mem_alloc_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - - assert(context != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - hipPointerAttribute_t hipPointerAttributeType; - - try { - ScopedContext active(context); - switch (param_name) { - case PI_MEM_ALLOC_TYPE: { - unsigned int value; - // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue - hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, ptr); - if (ret == hipErrorInvalidValue) { - // pointer not known to the HIP subsystem - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - result = check_error(ret, __func__, __LINE__ - 5, __FILE__); - value = hipPointerAttributeType.isManaged; - if (value) { - // pointer to managed memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_SHARED); - } - result = PI_CHECK_ERROR( - hipPointerGetAttributes(&hipPointerAttributeType, ptr)); - value = hipPointerAttributeType.memoryType; - assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost); - if (value == hipMemoryTypeDevice) { - // pointer to device memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_DEVICE); - } - if (value == hipMemoryTypeHost) { - // pointer to host memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_HOST); - } - // should never get here - __builtin_unreachable(); - return 
getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - case PI_MEM_ALLOC_BASE_PTR: { - return PI_ERROR_INVALID_VALUE; - } - case PI_MEM_ALLOC_SIZE: { - return PI_ERROR_INVALID_VALUE; - } - - case PI_MEM_ALLOC_DEVICE: { - // get device index associated with this pointer - result = PI_CHECK_ERROR( - hipPointerGetAttributes(&hipPointerAttributeType, ptr)); - int device_idx = hipPointerAttributeType.device; - - // currently each device is in its own platform, so find the platform at - // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); - - // get the device from the platform - pi_device device = - reinterpret_cast(platforms[device_idx]->devices_[0].get()); - return getInfo(param_value_size, param_value, param_value_size_ret, - device); - } - } - } catch (pi_result error) { - result = error; - } - - return result; -} - pi_result hip_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -3765,10 +3429,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEventGetNativeHandle, hip_piextEventGetNativeHandle) _PI_CL(piextEventCreateWithNativeHandle, hip_piextEventCreateWithNativeHandle) // Sampler - _PI_CL(piSamplerCreate, hip_piSamplerCreate) - _PI_CL(piSamplerGetInfo, hip_piSamplerGetInfo) - _PI_CL(piSamplerRetain, hip_piSamplerRetain) - _PI_CL(piSamplerRelease, hip_piSamplerRelease) + _PI_CL(piSamplerCreate, pi2ur::piSamplerCreate) + _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) + _PI_CL(piSamplerRetain, pi2ur::piSamplerRetain) + _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands _PI_CL(piEnqueueKernelLaunch, hip_piEnqueueKernelLaunch) _PI_CL(piEnqueueNativeKernel, hip_piEnqueueNativeKernel) @@ -3788,10 +3452,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferMap, 
hip_piEnqueueMemBufferMap) _PI_CL(piEnqueueMemUnmap, hip_piEnqueueMemUnmap) // USM - _PI_CL(piextUSMHostAlloc, hip_piextUSMHostAlloc) - _PI_CL(piextUSMDeviceAlloc, hip_piextUSMDeviceAlloc) - _PI_CL(piextUSMSharedAlloc, hip_piextUSMSharedAlloc) - _PI_CL(piextUSMFree, hip_piextUSMFree) + _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) + _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) + _PI_CL(piextUSMSharedAlloc, pi2ur::piextUSMSharedAlloc) + _PI_CL(piextUSMFree, pi2ur::piextUSMFree) _PI_CL(piextUSMEnqueueMemset, hip_piextUSMEnqueueMemset) _PI_CL(piextUSMEnqueueMemcpy, hip_piextUSMEnqueueMemcpy) _PI_CL(piextUSMEnqueuePrefetch, hip_piextUSMEnqueuePrefetch) @@ -3799,7 +3463,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextUSMEnqueueMemcpy2D, hip_piextUSMEnqueueMemcpy2D) _PI_CL(piextUSMEnqueueFill2D, hip_piextUSMEnqueueFill2D) _PI_CL(piextUSMEnqueueMemset2D, hip_piextUSMEnqueueMemset2D) - _PI_CL(piextUSMGetMemAllocInfo, hip_piextUSMGetMemAllocInfo) + _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, hip_piextEnqueueDeviceGlobalVariableWrite) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index ce184341a8405..7d8f8adcf6bd9 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -43,6 +43,7 @@ #include #include #include +#include #include "pi2ur.hpp" @@ -695,19 +696,8 @@ struct _pi_kernel { /// Sampler property layout: /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | -struct _pi_sampler { - std::atomic_uint32_t refCount_; - pi_uint32 props_; - pi_context context_; - - _pi_sampler(pi_context context) - : refCount_(1), props_(0), context_(context) {} - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_sampler : ur_sampler_handle_t_ { + using ur_sampler_handle_t_::ur_sampler_handle_t_; }; // ------------------------------------------------------------- diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index fb6856a100735..2d2673978c927 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -187,6 +187,9 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/platform.hpp" "ur/adapters/hip/memory.cpp" "ur/adapters/hip/memory.hpp" + "ur/adapters/hip/sampler.cpp" + "ur/adapters/hip/sampler.hpp" + "ur/adapters/hip/usm.cpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp index 24dc708d3b449..8be8035ec0acf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -297,7 +297,8 @@ urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { return UR_RESULT_ERROR_INVALID_MEM_OBJECT; } } - *phNativeMem = static_cast(hMem->mem_.buffer_mem_.get()); + *phNativeMem = + reinterpret_cast(hMem->mem_.buffer_mem_.get()); #elif defined(__HIP_PLATFORM_AMD__) *phNativeMem = reinterpret_cast(hMem->mem_.buffer_mem_.get()); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp new file mode 100644 index 0000000000000..151400c4a6128 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp @@ -0,0 +1,84 @@ +//===--------- sampler.cpp - HIP Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "sampler.hpp" +#include "common.hpp" + +ur_result_t urSamplerCreate(ur_context_handle_t hContext, + const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + std::unique_ptr retImplSampl{ + new ur_sampler_handle_t_(hContext)}; + + if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { + retImplSampl->props_ |= pDesc->normalizedCoords; + retImplSampl->props_ |= (pDesc->filterMode << 1); + retImplSampl->props_ |= (pDesc->addressingMode << 2); + } else { + // Set default values + retImplSampl->props_ |= true; // Normalized Coords + retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + } + + *phSampler = retImplSampl.release(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, + ur_sampler_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_SAMPLER_INFO_REFERENCE_COUNT: + return ReturnValue(hSampler->get_reference_count()); + case UR_SAMPLER_INFO_CONTEXT: + return ReturnValue(hSampler->context_); + case UR_SAMPLER_INFO_NORMALIZED_COORDS: { + bool norm_coords_prop = static_cast(hSampler->props_); + return ReturnValue(norm_coords_prop); + } + case UR_SAMPLER_INFO_FILTER_MODE: { + auto filter_prop = + static_cast(((hSampler->props_ 
>> 1) & 0x1)); + return ReturnValue(filter_prop); + } + case UR_SAMPLER_INFO_ADDRESSING_MODE: { + auto addressing_prop = + static_cast(hSampler->props_ >> 2); + return ReturnValue(addressing_prop); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return {}; +} + +ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + hSampler->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hSampler->get_reference_count() != 0, + "Reference count overflow detected in urSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (hSampler->decrement_reference_count() == 0) { + delete hSampler; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp new file mode 100644 index 0000000000000..3d0a3059e61fa --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp @@ -0,0 +1,31 @@ +//===--------- sampler.hpp - HIP Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "context.hpp" + +/// Implementation of samplers for HIP +/// +/// Sampler property layout: +/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct ur_sampler_handle_t_ : _ur_object { + std::atomic_uint32_t refCount_; + uint32_t props_; + ur_context_handle_t context_; + + ur_sampler_handle_t_(ur_context_handle_t context) + : refCount_(1), props_(0), context_(context) {} + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 96dcbf54eccb3..49fd29262db78 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -129,12 +129,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreate = urSamplerCreate; pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetInfo = urSamplerGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnRetain = urSamplerRetain; return UR_RESULT_SUCCESS; } @@ -225,14 +225,14 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnDeviceAlloc = nullptr; - pDdiTable->pfnFree = nullptr; - pDdiTable->pfnGetMemAllocInfo = nullptr; - pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; pDdiTable->pfnPoolDestroy 
= nullptr; pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnSharedAlloc = nullptr; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp new file mode 100644 index 0000000000000..66985fa46988e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -0,0 +1,231 @@ +//===--------- usm.cpp - HIP Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "platform.hpp" + +/// USM: Implements USM Host allocations using HIP Pinned Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(hipHostMalloc(ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + 
(!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM device allocations using a normal HIP device pointer +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(hipMalloc(ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM Shared allocations using HIP Managed Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + 
static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(hipMallocManaged(ppMem, size, hipMemAttachGlobal)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Frees the given USM pointer associated with the context. +/// +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, + void *pMem) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + unsigned int type; + hipPointerAttribute_t hipPointerAttributeType; + result = + UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem)); + type = hipPointerAttributeType.memoryType; + UR_ASSERT(type == hipMemoryTypeDevice || type == hipMemoryTypeHost, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (type == hipMemoryTypeDevice) { + result = UR_CHECK_ERROR(hipFree(pMem)); + } + if (type == hipMemoryTypeHost) { + result = UR_CHECK_ERROR(hipFreeHost(pMem)); + } + } catch (ur_result_t error) { + result = error; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = 
UR_RESULT_SUCCESS; + hipPointerAttribute_t hipPointerAttributeType; + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + try { + ScopedContext active(hContext); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + unsigned int value; + // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue + hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, pMem); + if (ret == hipErrorInvalidValue) { + // pointer not known to the HIP subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); + value = hipPointerAttributeType.isManaged; + if (value) { + // pointer to managed memory + return ReturnValue(UR_USM_TYPE_SHARED); + } + result = UR_CHECK_ERROR( + hipPointerGetAttributes(&hipPointerAttributeType, pMem)); + value = hipPointerAttributeType.memoryType; + UR_ASSERT(value == hipMemoryTypeDevice || value == hipMemoryTypeHost, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (value == hipMemoryTypeDevice) { + // pointer to device memory + return ReturnValue(UR_USM_TYPE_DEVICE); + } + if (value == hipMemoryTypeHost) { + // pointer to host memory + return ReturnValue(UR_USM_TYPE_HOST); + } + // should never get here +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + case UR_USM_ALLOC_INFO_BASE_PTR: + case UR_USM_ALLOC_INFO_SIZE: + return UR_RESULT_ERROR_INVALID_VALUE; + case UR_USM_ALLOC_INFO_DEVICE: { + // get device index associated with this pointer + result = UR_CHECK_ERROR( + hipPointerGetAttributes(&hipPointerAttributeType, pMem)); + + int device_idx = hipPointerAttributeType.device; + + // currently each device is in its own platform, so find the platform at + // the same index + std::vector platforms; + platforms.resize(device_idx + 1); + result = urPlatformGet(device_idx + 1, platforms.data(), nullptr); + + // get the device from the platform + ur_device_handle_t device = 
platforms[device_idx]->devices_[0].get(); + return ReturnValue(device); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } catch (ur_result_t error) { + result = error; + } + return result; +} From d77d2c81eb881e1c335e267d864061b074abf53f Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 15 May 2023 17:00:25 +0100 Subject: [PATCH 06/42] [SYCL][PI][UR][HIP] Port HIP program and kernel to Unified Runtime --- sycl/plugins/hip/CMakeLists.txt | 4 + sycl/plugins/hip/pi_hip.cpp | 704 +----------------- sycl/plugins/hip/pi_hip.hpp | 199 +---- sycl/plugins/unified_runtime/CMakeLists.txt | 4 + .../ur/adapters/hip/kernel.cpp | 294 ++++++++ .../ur/adapters/hip/kernel.hpp | 178 +++++ .../ur/adapters/hip/program.cpp | 301 ++++++++ .../ur/adapters/hip/program.hpp | 45 ++ .../ur/adapters/hip/ur_interface_loader.cpp | 47 +- 9 files changed, 879 insertions(+), 897 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 2344ddcecff15..147ba1b9fac97 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -105,6 +105,10 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/sampler.cpp" "../unified_runtime/ur/adapters/hip/sampler.hpp" "../unified_runtime/ur/adapters/hip/usm.cpp" + "../unified_runtime/ur/adapters/hip/program.cpp" + "../unified_runtime/ur/adapters/hip/program.hpp" + "../unified_runtime/ur/adapters/hip/kernel.cpp" + "../unified_runtime/ur/adapters/hip/kernel.hpp" "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 
2b16e90c04472..074e8c0e31221 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -625,68 +625,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { return PI_SUCCESS; } -_pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, - binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { - pi2ur::piContextRetain(context_); -} - -_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } - -pi_result _pi_program::set_binary(const char *source, size_t length) { - assert((binary_ == nullptr && binarySizeInBytes_ == 0) && - "Re-setting program binary data which has already been set"); - binary_ = source; - binarySizeInBytes_ = length; - return PI_SUCCESS; -} - -pi_result _pi_program::build_program(const char *build_options) { - - this->buildOptions_ = build_options; - - constexpr const unsigned int numberOfOptions = 4u; - - hipJitOption options[numberOfOptions]; - void *optionVals[numberOfOptions]; - - // Pass a buffer for info messages - options[0] = hipJitOptionInfoLogBuffer; - optionVals[0] = (void *)infoLog_; - // Pass the size of the info buffer - options[1] = hipJitOptionInfoLogBufferSizeBytes; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; - // Pass a buffer for error message - options[2] = hipJitOptionErrorLogBuffer; - optionVals[2] = (void *)errorLog_; - // Pass the size of the error buffer - options[3] = hipJitOptionErrorLogBufferSizeBytes; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - auto result = PI_CHECK_ERROR( - hipModuleLoadDataEx(&module_, static_cast(binary_), - numberOfOptions, options, optionVals)); - - const auto success = (result == PI_SUCCESS); - - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; - - // If no exception, result is correct - return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; -} - -/// Finds kernel names by searching for entry points in the PTX source, as the -/// HIP driver API doesn't expose an operation for this. 
-/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use hipModuleGetFunction to check for a kernel. -std::string getKernelNames(pi_program program) { - (void)program; - sycl::detail::pi::die("getKernelNames not implemented"); - return {}; -} - //-- PI API implementation extern "C" { @@ -1084,63 +1022,6 @@ pi_result hip_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { } } -pi_result hip_piKernelCreate(pi_program program, const char *kernel_name, - pi_kernel *kernel) { - assert(kernel != nullptr); - assert(program != nullptr); - - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_kernel> retKernel{nullptr}; - - try { - ScopedContext active(program->get_context()); - - hipFunction_t hipFunc; - retErr = PI_CHECK_ERROR( - hipModuleGetFunction(&hipFunc, program->get(), kernel_name)); - - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - hipFunction_t hipFuncWithOffsetParam; - hipError_t offsetRes = hipModuleGetFunction( - &hipFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); - - // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == hipErrorNotFound) { - hipFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); - } - - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{hipFunc, hipFuncWithOffsetParam, kernel_name, program, - program->get_context()}); - } catch (pi_result err) { - retErr = err; - } catch (...) 
{ - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *kernel = retKernel.release(); - return retErr; -} - -pi_result hip_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, const void *arg_value) { - - assert(kernel != nullptr); - pi_result retErr = PI_SUCCESS; - try { - if (arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - } else { - kernel->set_kernel_local_arg(arg_index, arg_size); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1378,545 +1259,6 @@ hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, return {}; } -/// Not used as HIP backend only creates programs from binary. -/// See \ref hip_piclProgramCreateWithBinary. -/// -pi_result hip_piclProgramCreateWithSource(pi_context context, pi_uint32 count, - const char **strings, - const size_t *lengths, - pi_program *program) { - (void)context; - (void)count; - (void)strings; - (void)lengths; - (void)program; - - sycl::detail::pi::hipPrint("hip_piclProgramCreateWithSource not implemented"); - return PI_ERROR_INVALID_OPERATION; -} - -/// Loads the images from a PI program into a HIPmodule that can be -/// used later on to extract functions (kernels). -/// See \ref _pi_program for implementation details. 
-/// -pi_result hip_piProgramBuild( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result hip_piProgramCreate(pi_context context, const void *il, size_t length, - pi_program *res_program) { - (void)context; - (void)il; - (void)length; - (void)res_program; - - sycl::detail::pi::die("hip_piProgramCreate not implemented"); - return {}; -} - -/// Loads images from a list of PTX or HIPBIN binaries. -/// Note: No calls to HIP driver API in this function, only store binaries -/// for later. 
-/// -/// Note: Only supports one device -/// -pi_result hip_piProgramCreateWithBinary( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const size_t *lengths, - const unsigned char **binaries, size_t num_metadata_entries, - const pi_device_binary_property *metadata, pi_int32 *binary_status, - pi_program *program) { - (void)num_metadata_entries; - (void)metadata; - (void)binary_status; - - assert(context != nullptr); - assert(binaries != nullptr); - assert(program != nullptr); - assert(device_list != nullptr); - assert(num_devices == 1 && "HIP contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); - - pi_result retError = PI_SUCCESS; - - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - // TODO: Set metadata here and use reqd_work_group_size information. - // See cuda_piProgramCreateWithBinary - - const bool has_length = (lengths != nullptr); - size_t length = has_length - ? 
lengths[0] - : strlen(reinterpret_cast(binaries[0])) + 1; - - assert(length != 0); - - retProgram->set_binary(reinterpret_cast(binaries[0]), length); - - *program = retProgram.release(); - - return retError; -} - -pi_result hip_piProgramGetInfo(pi_program program, pi_program_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->get_reference_count()); - case PI_PROGRAM_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->context_); - case PI_PROGRAM_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); - case PI_PROGRAM_INFO_SOURCE: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->binary_); - case PI_PROGRAM_INFO_BINARY_SIZES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binarySizeInBytes_); - case PI_PROGRAM_INFO_BINARIES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binary_); - case PI_PROGRAM_INFO_KERNEL_NAMES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - getKernelNames(program).c_str()); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program info request not implemented"); - return {}; -} - -pi_result hip_piProgramLink(pi_context context, pi_uint32 num_devices, - const pi_device *device_list, const char *options, - pi_uint32 num_input_programs, - const pi_program *input_programs, - void (*pfn_notify)(pi_program program, - void *user_data), - void *user_data, pi_program *ret_program) { - (void)context; - (void)num_devices; - 
(void)device_list; - (void)options; - (void)num_input_programs; - (void)input_programs; - (void)pfn_notify; - (void)user_data; - (void)ret_program; - sycl::detail::pi::die( - "hip_piProgramLink: linking not supported with hip backend"); - return {}; -} - -/// Creates a new program that is the outcome of the compilation of the headers -/// and the program. -/// \TODO Implement asynchronous compilation -/// -pi_result hip_piProgramCompile( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] pi_uint32 num_input_headers, - const pi_program *input_headers, const char **header_include_names, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - (void)input_headers; - (void)header_include_names; - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_input_headers == 0); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -pi_result hip_piProgramGetBuildInfo(pi_program program, pi_device device, - pi_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - (void)device; - - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_BUILD_INFO_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildStatus_); - } - case PI_PROGRAM_BUILD_INFO_OPTIONS: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildOptions_.c_str()); - case PI_PROGRAM_BUILD_INFO_LOG: - return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value, - param_value_size_ret, program->infoLog_); 
- default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program Build info request not implemented"); - return {}; -} - -pi_result hip_piProgramRetain(pi_program program) { - assert(program != nullptr); - assert(program->get_reference_count() > 0); - program->increment_reference_count(); - return PI_SUCCESS; -} - -/// Decreases the reference count of a pi_program object. -/// When the reference count reaches 0, it unloads the module from -/// the context. -pi_result hip_piProgramRelease(pi_program program) { - assert(program != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(program->get_reference_count() != 0 && - "Reference count overflow detected in hip_piProgramRelease."); - - // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { - - std::unique_ptr<_pi_program> program_ptr{program}; - - pi_result result = PI_ERROR_INVALID_PROGRAM; - - try { - ScopedContext active(program->get_context()); - auto hipModule = program->get(); - result = PI_CHECK_ERROR(hipModuleUnload(hipModule)); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; - } - - return PI_SUCCESS; -} - -/// Gets the native HIP handle of a PI program object -/// -/// \param[in] program The PI program to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. -/// -/// \return TBD -pi_result hip_piextProgramGetNativeHandle(pi_program program, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); - return PI_SUCCESS; -} - -/// Created a PI program object from a HIP program handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI program object from. -/// \param[in] context The PI context of the program. 
-/// \param[in] ownNativeHandle tells if should assume the ownership of -/// the native handle. -/// \param[out] program Set to the PI program object created from native handle. -/// -/// \return TBD -pi_result hip_piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_program *program) { - (void)nativeHandle; - (void)context; - (void)ownNativeHandle; - (void)program; - - sycl::detail::pi::die( - "Creation of PI program from native handle not implemented"); - return {}; -} - -pi_result hip_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_INFO_FUNCTION_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_name()); - case PI_KERNEL_INFO_NUM_ARGS: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_num_args()); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_reference_count()); - case PI_KERNEL_INFO_CONTEXT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_context()); - } - case PI_KERNEL_INFO_PROGRAM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_program()); - } - case PI_KERNEL_INFO_ATTRIBUTES: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - default: { - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - // here we want to query about a kernel's hip blocks! 
- - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; - - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, - device->get()) == hipSuccess); - - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, - device->get()) == hipSuccess); - - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - // Returns the work-group size specified in the kernel source or IL. - // If the work-group size is not specified in the kernel source or IL, - // (0, 0, 0) is returned. 
- // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html - - // TODO: can we extract the work group size from the PTX? - size_t group_size[3] = {0, 0, 0}; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == HIP SHARED - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == HIP LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in " - "piKernelGetGroupInfo not implemented\n"); - return {}; - } - - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result hip_piKernelGetSubGroupInfo( - pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) { - (void)input_value_size; - (void)input_value; - - if (kernel != nullptr) { - switch (param_name) { - case PI_KERNEL_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent 
to warp size - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); - int warpSize = 0; - hip_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, - 0, nullptr, sizeof(uint32_t), &warpSize, - nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { - // Return value of 0 => not specified - // TODO: Revisit if PTX is generated for compile-time work-group sizes - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { - // Return value of 0 => unspecified or "auto" sub-group size - // Correct for now, since warp size may be read from special register - // TODO: Return warp size once default is primary sub-group size - // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - return PI_ERROR_INVALID_KERNEL; -} - -pi_result hip_piKernelRetain(pi_kernel kernel) { - assert(kernel != nullptr); - assert(kernel->get_reference_count() > 0u); - - kernel->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result hip_piKernelRelease(pi_kernel kernel) { - assert(kernel != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. 
- assert(kernel->get_reference_count() != 0 && - "Reference count overflow detected in hip_piKernelRelease."); - - // decrement ref count. If it is 0, delete the program. - if (kernel->decrement_reference_count() == 0) { - // no internal hip resources to clean up. Just delete it. - delete kernel; - return PI_SUCCESS; - } - - return PI_SUCCESS; -} - -// A NOP for the HIP backend -pi_result hip_piKernelSetExecInfo(pi_kernel kernel, - pi_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value) { - (void)kernel; - (void)param_name; - (void)param_value_size; - (void)param_value; - - return PI_SUCCESS; -} - -pi_result hip_piextProgramSetSpecializationConstant(pi_program, pi_uint32, - size_t, const void *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the HIP plugin is AOT only so this entry point is not supported. - sycl::detail::pi::die("Native specialization constants are not supported"); - return {}; -} - -pi_result hip_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, const void *arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - return PI_SUCCESS; -} - // // Events // @@ -3392,31 +2734,31 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextMemGetNativeHandle, pi2ur::piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, pi2ur::piextMemCreateWithNativeHandle) // Program - _PI_CL(piProgramCreate, hip_piProgramCreate) - _PI_CL(piclProgramCreateWithSource, hip_piclProgramCreateWithSource) - _PI_CL(piProgramCreateWithBinary, hip_piProgramCreateWithBinary) - _PI_CL(piProgramGetInfo, hip_piProgramGetInfo) - _PI_CL(piProgramCompile, hip_piProgramCompile) - _PI_CL(piProgramBuild, hip_piProgramBuild) - _PI_CL(piProgramLink, hip_piProgramLink) - _PI_CL(piProgramGetBuildInfo, hip_piProgramGetBuildInfo) - _PI_CL(piProgramRetain, hip_piProgramRetain) - _PI_CL(piProgramRelease, hip_piProgramRelease) - _PI_CL(piextProgramGetNativeHandle, 
hip_piextProgramGetNativeHandle) + _PI_CL(piProgramCreate, pi2ur::piProgramCreate) + _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) + _PI_CL(piProgramCreateWithBinary, pi2ur::piProgramCreateWithBinary) + _PI_CL(piProgramGetInfo, pi2ur::piProgramGetInfo) + _PI_CL(piProgramCompile, pi2ur::piProgramCompile) + _PI_CL(piProgramBuild, pi2ur::piProgramBuild) + _PI_CL(piProgramLink, pi2ur::piProgramLink) + _PI_CL(piProgramGetBuildInfo, pi2ur::piProgramGetBuildInfo) + _PI_CL(piProgramRetain, pi2ur::piProgramRetain) + _PI_CL(piProgramRelease, pi2ur::piProgramRelease) + _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, - hip_piextProgramCreateWithNativeHandle) - // Kernel - _PI_CL(piKernelCreate, hip_piKernelCreate) - _PI_CL(piKernelSetArg, hip_piKernelSetArg) - _PI_CL(piKernelGetInfo, hip_piKernelGetInfo) - _PI_CL(piKernelGetGroupInfo, hip_piKernelGetGroupInfo) - _PI_CL(piKernelGetSubGroupInfo, hip_piKernelGetSubGroupInfo) - _PI_CL(piKernelRetain, hip_piKernelRetain) - _PI_CL(piKernelRelease, hip_piKernelRelease) - _PI_CL(piKernelSetExecInfo, hip_piKernelSetExecInfo) + pi2ur::piextProgramCreateWithNativeHandle) _PI_CL(piextProgramSetSpecializationConstant, - hip_piextProgramSetSpecializationConstant) - _PI_CL(piextKernelSetArgPointer, hip_piextKernelSetArgPointer) + pi2ur::piextProgramSetSpecializationConstant) + // Kernel + _PI_CL(piKernelCreate, pi2ur::piKernelCreate) + _PI_CL(piKernelSetArg, pi2ur::piKernelSetArg) + _PI_CL(piKernelGetInfo, pi2ur::piKernelGetInfo) + _PI_CL(piKernelGetGroupInfo, pi2ur::piKernelGetGroupInfo) + _PI_CL(piKernelGetSubGroupInfo, pi2ur::piKernelGetSubGroupInfo) + _PI_CL(piKernelRetain, pi2ur::piKernelRetain) + _PI_CL(piKernelRelease, pi2ur::piKernelRelease) + _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) + _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) // Event _PI_CL(piEventCreate, hip_piEventCreate) _PI_CL(piEventGetInfo, 
hip_piEventGetInfo) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 7d8f8adcf6bd9..f8a638327f103 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -41,21 +41,19 @@ #include #include +#include #include #include #include +#include #include "pi2ur.hpp" extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN -pi_result hip_piProgramRetain(pi_program); -pi_result hip_piProgramRelease(pi_program); pi_result hip_piQueueRelease(pi_queue); pi_result hip_piQueueRetain(pi_queue); -pi_result hip_piKernelRetain(pi_kernel); -pi_result hip_piKernelRelease(pi_kernel); /// \endcond } @@ -482,36 +480,8 @@ struct _pi_event { /// Implementation of PI Program on HIP Module object /// -struct _pi_program { - using native_type = hipModule_t; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - _pi_context *context_; - - constexpr static size_t MAX_LOG_SIZE = 8192u; - - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - pi_program_build_status buildStatus_ = PI_PROGRAM_BUILD_STATUS_NONE; - - _pi_program(pi_context ctxt); - ~_pi_program(); - - pi_result set_binary(const char *binary, size_t binarySizeInBytes); - - pi_result build_program(const char *build_options); - - pi_context get_context() const { return context_; }; - - native_type get() const noexcept { return module_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_program : ur_program_handle_t_ { + using ur_program_handle_t_::ur_program_handle_t_; }; /// Implementation of a PI Kernel for HIP @@ -530,165 +500,8 @@ struct _pi_program { /// HIP shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. 
/// -struct _pi_kernel { - using native_type = hipFunction_t; - - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - pi_context context_; - pi_program program_; - std::atomic_uint32_t refCount_; - - /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes - /// from the kernel signature. - /// This is not something can be queried from the HIP API - /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) - /// and a storage. - /// - struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; - using args_size_t = std::vector; - using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; - - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; - - arguments() { - // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); - } - - /// Adds an argument to the kernel. - /// If the argument existed before, it is replaced. - /// Otherwise, it is added. - /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. 
- void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { - // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); - // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); - } - paramSizes_[index] = size; - // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); - // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; - } - - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); - - // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; - - // for arguments smaller than the maximum alignment simply align to the - // size of the argument - const size_t alignment = std::min(max_alignment, size); - - // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 0) { - alignedLocalOffset += alignment - (localOffset % alignment); - } - - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); - } - - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); - } - - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); - } - - args_index_t get_indices() const noexcept { return indices_; } - - pi_uint32 get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); - } - } args_; - - _pi_kernel(hipFunction_t func, hipFunction_t funcWithOffsetParam, - const char *name, pi_program program, pi_context 
ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - hip_piProgramRetain(program_); - pi2ur::piContextRetain(context_); - } - - _pi_kernel(hipFunction_t func, const char *name, pi_program program, - pi_context ctxt) - : _pi_kernel{func, nullptr, name, program, ctxt} {} - - ~_pi_kernel() { - hip_piProgramRelease(program_); - pi2ur::piContextRelease(context_); - } - - pi_program get_program() const noexcept { return program_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; - - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; - } - - pi_context get_context() const noexcept { return context_; }; - - const char *get_name() const noexcept { return name_.c_str(); } - - /// Returns the number of arguments, excluding the implicit global offset. 
- /// Note this only returns the current known number of arguments, not the - /// real one required by the kernel, since this cannot be queried from - /// the HIP Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } - - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); - } - - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); - } - - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - args_.set_implicit_offset(size, implicitOffset); - } - - arguments::args_index_t get_arg_indices() const { - return args_.get_indices(); - } - - pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); } - - void clear_local_size() { args_.clear_local_size(); } +struct _pi_kernel : ur_kernel_handle_t_ { + using ur_kernel_handle_t_::ur_kernel_handle_t_; }; /// Implementation of samplers for HIP diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2d2673978c927..520e2bd2f86af 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -190,6 +190,10 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/sampler.cpp" "ur/adapters/hip/sampler.hpp" "ur/adapters/hip/usm.cpp" + "ur/adapters/hip/program.cpp" + "ur/adapters/hip/program.hpp" + "ur/adapters/hip/kernel.cpp" + "ur/adapters/hip/kernel.hpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp new file mode 100644 index 0000000000000..40dba4a782d75 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -0,0 +1,294 @@ +//===--------- kernel.cpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "kernel.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, + ur_kernel_handle_t *phKernel) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t retErr = UR_RESULT_SUCCESS; + std::unique_ptr retKernel{nullptr}; + + try { + ScopedContext active(hProgram->get_context()); + + hipFunction_t hipFunc; + retErr = UR_CHECK_ERROR( + hipModuleGetFunction(&hipFunc, hProgram->get(), pKernelName)); + + std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; + hipFunction_t hipFuncWithOffsetParam; + hipError_t offsetRes = hipModuleGetFunction( + &hipFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == hipErrorNotFound) { + hipFuncWithOffsetParam = nullptr; + } else { + retErr = UR_CHECK_ERROR(offsetRes); + } + retKernel = std::unique_ptr( + new ur_kernel_handle_t_{hipFunc, hipFuncWithOffsetParam, pKernelName, + hProgram, hProgram->get_context()}); + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phKernel = retKernel.release(); + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // Here we want to query about a kernel's cuda blocks! 
+ UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; + + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, + hDevice->get()) == hipSuccess); + + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, + hDevice->get()) == hipSuccess); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return ReturnValue(global_work_size, 3); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::ur::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == hipSuccess); + return ReturnValue(size_t(max_threads)); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + // Returns the work-group size specified in the kernel source or IL. + // If the work-group size is not specified in the kernel source or IL, + // (0, 0, 0) is returned. 
+ // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + // TODO: can we extract the work group size from the PTX? + return ReturnValue(group_size, 3); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == HIP SHARED + int bytes = 0; + sycl::detail::ur::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == hipSuccess); + return ReturnValue(uint64_t(bytes)); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == HIP LOCAL + int bytes = 0; + sycl::detail::ur::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == hipSuccess); + return ReturnValue(uint64_t(bytes)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->get_reference_count() > 0u, + UR_RESULT_ERROR_INVALID_KERNEL); + + hKernel->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelRelease(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hKernel->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_KERNEL); + + // decrement ref count. If it is 0, delete the program. + if (hKernel->decrement_reference_count() == 0) { + // no internal cuda resources to clean up. Just delete it. 
+ delete hKernel; + return UR_RESULT_SUCCESS; + } + + return UR_RESULT_SUCCESS; +} + +// TODO(ur): Not implemented on hip atm. Also, need to add tests for this +// feature. +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { + (void)hKernel; + (void)phNativeKernel; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, const void *pArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (pArgValue) { + hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + } else { + hKernel->set_kernel_local_arg(argIndex, argSize); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, + size_t propSize, + void *pKernelInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_INFO_FUNCTION_NAME: + return ReturnValue(hKernel->get_name()); + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(hKernel->get_num_args()); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(hKernel->get_reference_count()); + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(hKernel->get_context()); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(hKernel->get_program()); + case UR_KERNEL_INFO_ATTRIBUTES: + return ReturnValue(""); + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, 
UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int warpSize = 0; + sycl::detail::ur::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == hipSuccess); + int warpSize = 0; + urKernelGetSubGroupInfo(hKernel, hDevice, + UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, + sizeof(uint32_t), &warpSize, nullptr); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(static_cast(maxWarps)); + } + case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return ReturnValue(0); + } + case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + return ReturnValue(0); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { + hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + return UR_RESULT_SUCCESS; +} + +// A NOP for the HIP backend +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t 
propName, + size_t propSize, const void *pPropValue) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp new file mode 100644 index 0000000000000..2fdc79da2de29 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp @@ -0,0 +1,178 @@ +//===--------- kernel.hpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +#include +#include +#include + +#include "program.hpp" + +struct ur_kernel_handle_t_ : _ur_object { + using native_type = hipFunction_t; + + native_type function_; + native_type functionWithOffsetParam_; + std::string name_; + ur_context_handle_t context_; + ur_program_handle_t program_; + std::atomic_uint32_t refCount_; + + /// Structure that holds the arguments to the kernel. + /// Note earch argument size is known, since it comes + /// from the kernel signature. + /// This is not something can be queried from the HIP API + /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) + /// and a storage. 
+ /// + struct arguments { + static constexpr size_t MAX_PARAM_BYTES = 4000u; + using args_t = std::array; + using args_size_t = std::vector; + using args_index_t = std::vector; + args_t storage_; + args_size_t paramSizes_; + args_index_t indices_; + args_size_t offsetPerIndex_; + + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + + /// Adds an argument to the kernel. + /// If the argument existed before, it is replaced. + /// Otherwise, it is added. + /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. + void add_arg(size_t index, size_t size, const void *arg, + size_t localSize = 0) { + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); + // Ensure enough space for the new argument + paramSizes_.resize(index + 1); + offsetPerIndex_.resize(index + 1); + } + paramSizes_[index] = size; + // calculate the insertion point on the array + size_t insertPos = std::accumulate(std::begin(paramSizes_), + std::begin(paramSizes_) + index, 0); + // Update the stored value for the argument + std::memcpy(&storage_[insertPos], arg, size); + indices_[index] = &storage_[insertPos]; + offsetPerIndex_[index] = localSize; + } + + void add_local_arg(size_t index, size_t size) { + size_t localOffset = this->get_local_size(); + + // maximum required alignment is the size of the largest vector type + const size_t max_alignment = sizeof(double) * 16; + + // for arguments smaller than the maximum alignment simply align to the + // size of the argument + const size_t alignment = std::min(max_alignment, size); + + // align the argument + size_t alignedLocalOffset = localOffset; + if (localOffset % alignment != 0) { + alignedLocalOffset += alignment - (localOffset % alignment); + } + + add_arg(index, 
sizeof(size_t), (const void *)&(alignedLocalOffset), + size + (alignedLocalOffset - localOffset)); + } + + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + + void clear_local_size() { + std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + } + + const args_index_t &get_indices() const noexcept { return indices_; } + + uint32_t get_local_size() const { + return std::accumulate(std::begin(offsetPerIndex_), + std::end(offsetPerIndex_), 0); + } + } args_; + + ur_kernel_handle_t_(hipFunction_t func, hipFunction_t funcWithOffsetParam, + const char *name, ur_program_handle_t program, + ur_context_handle_t ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + urProgramRetain(program_); + urContextRetain(context_); + } + + ur_kernel_handle_t_(hipFunction_t func, const char *name, + ur_program_handle_t program, ur_context_handle_t ctxt) + : ur_kernel_handle_t_{func, nullptr, name, program, ctxt} {} + + ~ur_kernel_handle_t_() { + urProgramRelease(program_); + urContextRelease(context_); + } + + ur_program_handle_t get_program() const noexcept { return program_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + native_type get() const noexcept { return function_; }; + + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != nullptr; + } + + ur_context_handle_t get_context() const noexcept { return context_; }; + + const char *get_name() const noexcept { return name_.c_str(); } + + /// Returns the number of arguments, excluding the implicit 
global offset. + /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the HIP Driver API + uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + + void set_kernel_arg(int index, size_t size, const void *arg) { + args_.add_arg(index, size, arg); + } + + void set_kernel_local_arg(int index, size_t size) { + args_.add_local_arg(index, size); + } + + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + return args_.set_implicit_offset(size, implicitOffset); + } + + const arguments::args_index_t &get_arg_indices() const { + return args_.get_indices(); + } + + uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + + void clear_local_size() { args_.clear_local_size(); } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp new file mode 100644 index 0000000000000..9420c30982975 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -0,0 +1,301 @@ +//===--------- program.cpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "program.hpp" + +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) + : module_{nullptr}, binary_{}, + binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { + urContextRetain(context_); +} + +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } + +ur_result_t ur_program_handle_t_::set_binary(const char *source, + size_t length) { + // Do not re-set program binary data which has already been set as that will + // delete the old binary data. 
+ UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_RESULT_ERROR_INVALID_OPERATION); + binary_ = source; + binarySizeInBytes_ = length; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::build_program(const char *build_options) { + if (build_options) { + this->buildOptions_ = build_options; + } + + constexpr const unsigned int numberOfOptions = 4u; + + hipJitOption options[numberOfOptions]; + void *optionVals[numberOfOptions]; + + // Pass a buffer for info messages + options[0] = hipJitOptionInfoLogBuffer; + optionVals[0] = (void *)infoLog_; + // Pass the size of the info buffer + options[1] = hipJitOptionInfoLogBufferSizeBytes; + optionVals[1] = (void *)(long)MAX_LOG_SIZE; + // Pass a buffer for error message + options[2] = hipJitOptionErrorLogBuffer; + optionVals[2] = (void *)errorLog_; + // Pass the size of the error buffer + options[3] = hipJitOptionErrorLogBufferSizeBytes; + optionVals[3] = (void *)(long)MAX_LOG_SIZE; + + auto result = UR_CHECK_ERROR( + hipModuleLoadDataEx(&module_, static_cast(binary_), + numberOfOptions, options, optionVals)); + + const auto success = (result == UR_RESULT_SUCCESS); + + buildStatus_ = + success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + + // If no exception, result is correct + return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// HIP driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to UR and use hipModuleGetFunction to check for a kernel. +ur_result_t getKernelNames(ur_program_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// HIP will handle the PTX/HIPBIN binaries internally through hipModule_t +/// object. 
So, urProgramCreateWithIL and urProgramCreateWithBinary are +/// equivalent in terms of HIP adapter. See \ref urProgramCreateWithBinary. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_device_handle_t hDevice = hContext->get_device(); + auto pBinary = reinterpret_cast(pIL); + + return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, + pProperties, phProgram); +} + +/// HIP will handle the PTX/HIPBIN binaries internally through a call to +/// hipModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent +/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, + const char *pOptions) { + return urProgramBuild(hContext, hProgram, pOptions); +} + +/// Loads the images from a UR program into a CUmodule that can be +/// used later on to extract functions (kernels). +/// See \ref ur_program_handle_t for implementation details. +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hProgram->get_context()); + + hProgram->build_program(pOptions); + + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, const char *pOptions, + ur_program_handle_t *phProgram) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// Created a UR program object from a HIP program handle. +/// TODO: Implement this. 
+/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR program object from. +/// \param[in] context The UR context of the program. +/// \param[out] program Set to the UR program object created from native handle. +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + ur_program_handle_t *phProgram) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, + ur_program_build_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_BUILD_INFO_STATUS: { + return ReturnValue(hProgram->buildStatus_); + } + case UR_PROGRAM_BUILD_INFO_OPTIONS: + return ReturnValue(hProgram->buildOptions_.c_str()); + case UR_PROGRAM_BUILD_INFO_LOG: + return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, + size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(hProgram->get_reference_count()); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(hProgram->context_); + case UR_PROGRAM_INFO_NUM_DEVICES: + return ReturnValue(1u); + case UR_PROGRAM_INFO_DEVICES: + return ReturnValue(&hProgram->context_->deviceId_, 1); + case UR_PROGRAM_INFO_SOURCE: + 
return ReturnValue(hProgram->binary_); + case UR_PROGRAM_INFO_BINARY_SIZES: + return ReturnValue(&hProgram->binarySizeInBytes_, 1); + case UR_PROGRAM_INFO_BINARIES: + return ReturnValue(&hProgram->binary_, 1); + case UR_PROGRAM_INFO_KERNEL_NAMES: + return getKernelNames(hProgram); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(program->get_reference_count() > 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + program->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of a ur_program_handle_t object. +/// When the reference count reaches 0, it unloads the module from +/// the context. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(program->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + + // decrement ref count. If it is 0, delete the program. + if (program->decrement_reference_count() == 0) { + + std::unique_ptr program_ptr{program}; + + ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + + try { + ScopedContext active(program->get_context()); + auto hipModule = program->get(); + result = UR_CHECK_ERROR(hipModuleUnload(hipModule)); + } catch (...) { + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native HIP handle of a UR program object +/// +/// \param[in] program The UR program to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the UR program object. 
+/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t program, ur_native_handle_t *nativeHandle) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + *nativeHandle = reinterpret_cast(program->get()); + return UR_RESULT_SUCCESS; +} + +/// Loads images from a list of PTX or HIPBin binaries. +/// Note: No calls to HIP driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); + UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ur_result_t retError = UR_RESULT_SUCCESS; + + std::unique_ptr retProgram{ + new ur_program_handle_t_{hContext}}; + + // TODO: Set metadata here and use reqd_work_group_size information. + // See urProgramCreateWithBinary in CUDA adapter. + + auto pBinary_string = reinterpret_cast(pBinary); + if (size == 0) { + size = strlen(pBinary_string) + 1; + } + + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); + + retError = retProgram->set_binary(pBinary_string, size); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + *phProgram = retProgram.release(); + + return retError; +} + +// This entry point is only used for native specialization constants (SPIR-V), +// and the CUDA plugin is AOT only so this entry point is not supported. 
+UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp new file mode 100644 index 0000000000000..d84f888c755d0 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp @@ -0,0 +1,45 @@ +//===--------- program.hpp - HIP Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +#include + +#include "context.hpp" + +struct ur_program_handle_t_ : _ur_object { + using native_type = hipModule_t; + native_type module_; + const char *binary_; + size_t binarySizeInBytes_; + std::atomic_uint32_t refCount_; + ur_context_handle_t context_; + + constexpr static size_t MAX_LOG_SIZE = 8192u; + + char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; + std::string buildOptions_; + ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + + ur_program_handle_t_(ur_context_handle_t ctxt); + ~ur_program_handle_t_(); + + ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + + ur_result_t build_program(const char *build_options); + ur_context_handle_t get_context() const { return context_; }; + + native_type get() const noexcept { return module_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 49fd29262db78..cb90edcd81c0b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -83,19 +83,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBuild = nullptr; - pDdiTable->pfnCompile = nullptr; - pDdiTable->pfnCreateWithBinary = nullptr; - pDdiTable->pfnCreateWithIL = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; pDdiTable->pfnGetFunctionPointer = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnLink = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; return UR_RESULT_SUCCESS; } @@ -105,20 +106,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetGroupInfo = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetSubGroupInfo = nullptr; - pDdiTable->pfnRelease = 
nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; pDdiTable->pfnSetArgMemObj = nullptr; - pDdiTable->pfnSetArgPointer = nullptr; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; - pDdiTable->pfnSetArgValue = nullptr; - pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; return UR_RESULT_SUCCESS; } From cd77582837b72a727d9ae95d144120ea8d18f27b Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 16 May 2023 16:16:06 +0100 Subject: [PATCH 07/42] [SYCL][HIP][PI][UR] Port queue event entry points --- sycl/plugins/hip/CMakeLists.txt | 4 + sycl/plugins/hip/pi_hip.cpp | 966 ++++-------------- sycl/plugins/hip/pi_hip.hpp | 411 ++------ sycl/plugins/unified_runtime/CMakeLists.txt | 4 + .../ur/adapters/hip/context.hpp | 10 +- .../unified_runtime/ur/adapters/hip/event.cpp | 331 ++++++ .../unified_runtime/ur/adapters/hip/event.hpp | 177 ++++ .../unified_runtime/ur/adapters/hip/queue.cpp | 300 ++++++ .../unified_runtime/ur/adapters/hip/queue.hpp | 243 +++++ .../ur/adapters/hip/ur_interface_loader.cpp | 32 +- 10 files changed, 1312 insertions(+), 1166 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp diff 
--git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 147ba1b9fac97..f90d510ecb7f5 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -98,6 +98,8 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/context.hpp" "../unified_runtime/ur/adapters/hip/device.cpp" "../unified_runtime/ur/adapters/hip/device.hpp" + "../unified_runtime/ur/adapters/hip/event.cpp" + "../unified_runtime/ur/adapters/hip/event.hpp" "../unified_runtime/ur/adapters/hip/platform.cpp" "../unified_runtime/ur/adapters/hip/platform.hpp" "../unified_runtime/ur/adapters/hip/memory.cpp" @@ -109,6 +111,8 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/program.hpp" "../unified_runtime/ur/adapters/hip/kernel.cpp" "../unified_runtime/ur/adapters/hip/kernel.hpp" + "../unified_runtime/ur/adapters/hip/queue.cpp" + "../unified_runtime/ur/adapters/hip/queue.hpp" "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 074e8c0e31221..746cc82a73b1b 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -47,6 +47,109 @@ pi_result map_error(hipError_t result) { } } +// TODO(ur) - this can be removed once more of pi entry points are ported to UR. 
+pi_result map_ur_error(ur_result_t result) { + + switch (result) { +#define CASE(UR_ERR, PI_ERR) \ + case UR_ERR: \ + return PI_ERR; + + CASE(UR_RESULT_SUCCESS, PI_SUCCESS) + CASE(UR_RESULT_ERROR_INVALID_OPERATION, PI_ERROR_INVALID_OPERATION) + CASE(UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES, + PI_ERROR_INVALID_QUEUE_PROPERTIES) + CASE(UR_RESULT_ERROR_INVALID_QUEUE, PI_ERROR_INVALID_QUEUE) + CASE(UR_RESULT_ERROR_INVALID_VALUE, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_INVALID_CONTEXT, PI_ERROR_INVALID_CONTEXT) + CASE(UR_RESULT_ERROR_INVALID_PLATFORM, PI_ERROR_INVALID_PLATFORM) + CASE(UR_RESULT_ERROR_INVALID_BINARY, PI_ERROR_INVALID_BINARY) + CASE(UR_RESULT_ERROR_INVALID_PROGRAM, PI_ERROR_INVALID_BINARY) + CASE(UR_RESULT_ERROR_INVALID_SAMPLER, PI_ERROR_INVALID_SAMPLER) + CASE(UR_RESULT_ERROR_INVALID_BUFFER_SIZE, PI_ERROR_INVALID_BUFFER_SIZE) + CASE(UR_RESULT_ERROR_INVALID_MEM_OBJECT, PI_ERROR_INVALID_MEM_OBJECT) + CASE(UR_RESULT_ERROR_INVALID_EVENT, PI_ERROR_INVALID_EVENT) + CASE(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + PI_ERROR_INVALID_EVENT_WAIT_LIST) + CASE(UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET, + PI_ERROR_MISALIGNED_SUB_BUFFER_OFFSET) + CASE(UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE, + PI_ERROR_INVALID_WORK_GROUP_SIZE) + CASE(UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE, + PI_ERROR_COMPILER_NOT_AVAILABLE) + CASE(UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE, + PI_ERROR_PROFILING_INFO_NOT_AVAILABLE) + CASE(UR_RESULT_ERROR_DEVICE_NOT_FOUND, PI_ERROR_DEVICE_NOT_FOUND) + CASE(UR_RESULT_ERROR_INVALID_DEVICE, PI_ERROR_INVALID_DEVICE) + CASE(UR_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_AVAILABLE) + // UR_RESULT_ERROR_DEVICE_REQUIRES_RESET + // UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE + CASE(UR_RESULT_ERROR_DEVICE_PARTITION_FAILED, + PI_ERROR_DEVICE_PARTITION_FAILED) + CASE(UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT, + PI_ERROR_INVALID_DEVICE_PARTITION_COUNT) + CASE(UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE, + PI_ERROR_INVALID_WORK_ITEM_SIZE) + 
CASE(UR_RESULT_ERROR_INVALID_WORK_DIMENSION, + PI_ERROR_INVALID_WORK_DIMENSION) + CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGS, PI_ERROR_INVALID_KERNEL_ARGS) + CASE(UR_RESULT_ERROR_INVALID_KERNEL, PI_ERROR_INVALID_KERNEL) + CASE(UR_RESULT_ERROR_INVALID_KERNEL_NAME, PI_ERROR_INVALID_KERNEL_NAME) + CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX, + PI_ERROR_INVALID_ARG_INDEX) + CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE, + PI_ERROR_INVALID_ARG_SIZE) + // UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE + CASE(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, PI_ERROR_INVALID_IMAGE_SIZE) + CASE(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, + PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE(UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED, + PI_ERROR_IMAGE_FORMAT_NOT_SUPPORTED) + CASE(UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE, + PI_ERROR_MEM_OBJECT_ALLOCATION_FAILURE) + CASE(UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE, + PI_ERROR_INVALID_PROGRAM_EXECUTABLE) + CASE(UR_RESULT_ERROR_UNINITIALIZED, PI_ERROR_UNINITIALIZED) + CASE(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY, PI_ERROR_OUT_OF_HOST_MEMORY) + CASE(UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, PI_ERROR_OUT_OF_RESOURCES) + CASE(UR_RESULT_ERROR_OUT_OF_RESOURCES, PI_ERROR_OUT_OF_RESOURCES) + CASE(UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE, PI_ERROR_BUILD_PROGRAM_FAILURE) + CASE(UR_RESULT_ERROR_PROGRAM_LINK_FAILURE, PI_ERROR_LINK_PROGRAM_FAILURE) + // UR_RESULT_ERROR_UNSUPPORTED_VERSION + // UR_RESULT_ERROR_UNSUPPORTED_FEATURE + CASE(UR_RESULT_ERROR_INVALID_ARGUMENT, PI_ERROR_INVALID_ARG_VALUE) + CASE(UR_RESULT_ERROR_INVALID_NULL_HANDLE, PI_ERROR_INVALID_VALUE) + // UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE + CASE(UR_RESULT_ERROR_INVALID_NULL_POINTER, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_INVALID_SIZE, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_UNSUPPORTED_SIZE, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, PI_ERROR_INVALID_VALUE) + 
CASE(UR_RESULT_ERROR_INVALID_ENUMERATION, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT, + PI_ERROR_IMAGE_FORMAT_MISMATCH) + CASE(UR_RESULT_ERROR_INVALID_NATIVE_BINARY, PI_ERROR_INVALID_BINARY) + CASE(UR_RESULT_ERROR_INVALID_GLOBAL_NAME, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_ERROR_INVALID_VALUE) + CASE(UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION, + PI_ERROR_INVALID_WORK_GROUP_SIZE) + CASE(UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION, + PI_ERROR_INVALID_GLOBAL_WORK_SIZE) + // UR_RESULT_ERROR_PROGRAM_UNLINKED + // UR_RESULT_ERROR_OVERLAPPING_REGIONS + CASE(UR_RESULT_ERROR_INVALID_HOST_PTR, PI_ERROR_INVALID_HOST_PTR) + // UR_RESULT_ERROR_INVALID_USM_SIZE + CASE(UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE, + PI_ERROR_MEM_OBJECT_ALLOCATION_FAILURE) + CASE(UR_RESULT_ERROR_ADAPTER_SPECIFIC, PI_ERROR_PLUGIN_SPECIFIC_ERROR) + +#undef CASE + default: + return PI_ERROR_UNKNOWN; + } +} + // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR constexpr size_t MaxMessageSize = 256; thread_local pi_result ErrorMessageCode = PI_SUCCESS; @@ -84,51 +187,6 @@ pi_result hip_piPluginGetBackendOption(pi_platform, const char *frontend_option, return PI_ERROR_INVALID_VALUE; } -// Iterates over the event wait list, returns correct pi_result error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a pi_result. 
-template -pi_result forLatestEvents(const pi_event *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { - - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { - return PI_ERROR_INVALID_EVENT_WAIT_LIST; - } - - // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); - } - - std::vector events{event_wait_list, - event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { - // Tiered sort creating sublists of streams (smallest value first) in which - // the corresponding events are sorted into a sequence of newest first. - return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); - }); - - bool first = true; - hipStream_t lastSeenStream = 0; - for (pi_event event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { - continue; - } - - first = false; - lastSeenStream = event->get_stream(); - - auto result = f(event); - if (result != PI_SUCCESS) { - return result; - } - } - - return PI_SUCCESS; -} - /// Converts HIP error into PI error codes, and outputs error information /// to stderr. /// If PI_HIP_ABORT env variable is defined, it aborts directly instead of @@ -224,29 +282,6 @@ pi_result getInfo(size_t param_value_size, void *param_value, param_value_size_ret, value); } -ScopedContext::ScopedContext(pi_context ctxt) - : placedContext_{ctxt}, needToRecover_{false} { - if (!placedContext_) { - throw PI_ERROR_INVALID_CONTEXT; - } - - hipCtx_t desired = placedContext_->get(); - PI_CHECK_ERROR(hipCtxGetCurrent(&original_)); - if (original_ != desired) { - // Sets the desired context as the active one for the thread - PI_CHECK_ERROR(hipCtxSetCurrent(desired)); - if (original_ == nullptr) { - // No context is installed on the current thread - // This is the most common case. 
We can activate the context in the - // thread and leave it there until all the PI context referring to the - // same underlying HIP context are destroyed. This emulates - // the behaviour of the HIP runtime api, and avoids costly context - // switches. No action is required on this side of the if. - } else { - needToRecover_ = true; - } - } -} /// \endcond void simpleGuessLocalWorkSize(size_t *threadsPerBlock, @@ -282,15 +317,16 @@ pi_result enqueueEventsWait(pi_queue command_queue, hipStream_t stream, try { ScopedContext active(command_queue->get_context()); - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { + auto result = map_ur_error(forLatestEvents( + reinterpret_cast(event_wait_list), + num_events_in_wait_list, + [stream](ur_event_handle_t event) -> ur_result_t { if (event->get_stream() == stream) { - return PI_SUCCESS; + return UR_RESULT_SUCCESS; } else { - return PI_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0)); + return UR_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0)); } - }); + })); if (result != PI_SUCCESS) { return result; @@ -349,271 +385,10 @@ pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -pi_result hip_piEventRelease(pi_event event); -pi_result hip_piEventRetain(pi_event event); - } // extern "C" /// \endcond -void _pi_queue::compute_stream_wait_for_barrier_if_needed(hipStream_t stream, - pi_uint32 stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; - } -} - -void _pi_queue::transfer_stream_wait_for_barrier_if_needed(hipStream_t stream, - pi_uint32 stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); - transfer_applied_barrier_[stream_i] 
= true; - } -} - -hipStream_t _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) { - pi_uint32 stream_i; - pi_uint32 token; - while (true) { - if (num_compute_streams_ < compute_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { - PI_CHECK_ERROR(hipStreamCreateWithFlags( - &compute_streams_[num_compute_streams_++], flags_)); - } - } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); - // if a stream has been reused before it was next selected round-robin - // fashion, we want to delay its next use and instead select another one - // that is more likely to have completed all the enqueued work. - if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; - } else { - break; - } - } - if (stream_token) { - *stream_token = token; - } - hipStream_t res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -hipStream_t _pi_queue::get_next_compute_stream( - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - _pi_stream_guard &guard, pi_uint32 *stream_token) { - for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) { - pi_uint32 token = event_wait_list[i]->get_compute_stream_token(); - if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); - // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - pi_uint32 stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; - } - guard = _pi_stream_guard{std::move(compute_sync_guard)}; - hipStream_t res = 
event_wait_list[i]->get_stream(); - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; - } - } - } - guard = {}; - return get_next_compute_stream(stream_token); -} - -hipStream_t _pi_queue::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); - } - if (num_transfer_streams_ < transfer_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { - PI_CHECK_ERROR(hipStreamCreateWithFlags( - &transfer_streams_[num_transfer_streams_++], flags_)); - } - } - pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - hipStream_t res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - hipStream_t stream, pi_uint32 stream_token) - : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false}, - isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { - - assert(type != PI_COMMAND_TYPE_USER); - - bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; - - PI_CHECK_ERROR(hipEventCreateWithFlags( - &evEnd_, profilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); - - if (profilingEnabled) { - PI_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault)); - PI_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault)); - } - - if (queue_ != nullptr) { - hip_piQueueRetain(queue_); - } - pi2ur::piContextRetain(context_); -} - -_pi_event::~_pi_event() { - if (queue_ != nullptr) { - hip_piQueueRelease(queue_); - } - pi2ur::piContextRelease(context_); -} - -pi_result _pi_event::start() { - assert(!is_started()); - pi_result result = PI_SUCCESS; - - try { - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - // NOTE: This relies on the default stream to be unused. - PI_CHECK_ERROR(hipEventRecord(evQueued_, 0)); - PI_CHECK_ERROR(hipEventRecord(evStart_, queue_->get())); - } - } catch (pi_result error) { - result = error; - } - - isStarted_ = true; - return result; -} - -bool _pi_event::is_completed() const noexcept { - if (!isRecorded_) { - return false; - } - if (!hasBeenWaitedOn_) { - const hipError_t ret = hipEventQuery(evEnd_); - if (ret != hipSuccess && ret != hipErrorNotReady) { - PI_CHECK_ERROR(ret); - return false; - } - if (ret == hipErrorNotReady) { - return false; - } - } - return true; -} - -pi_uint64 _pi_event::get_queued_time() const { - float miliSeconds = 0.0f; - assert(is_started()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. - PI_CHECK_ERROR(hipEventSynchronize(evStart_)); - PI_CHECK_ERROR(hipEventSynchronize(evEnd_)); - - PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_)); - return static_cast(miliSeconds * 1.0e6); -} - -pi_uint64 _pi_event::get_start_time() const { - float miliSeconds = 0.0f; - assert(is_started()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. 
- PI_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); - PI_CHECK_ERROR(hipEventSynchronize(evStart_)); - - PI_CHECK_ERROR( - hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evStart_)); - return static_cast(miliSeconds * 1.0e6); -} - -pi_uint64 _pi_event::get_end_time() const { - float miliSeconds = 0.0f; - assert(is_started() && is_recorded()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. - PI_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); - PI_CHECK_ERROR(hipEventSynchronize(evEnd_)); - - PI_CHECK_ERROR( - hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evEnd_)); - return static_cast(miliSeconds * 1.0e6); -} - -pi_result _pi_event::record() { - - if (is_recorded() || !is_started()) { - return PI_ERROR_INVALID_EVENT; - } - - pi_result result = PI_ERROR_INVALID_OPERATION; - - if (!queue_) { - return PI_ERROR_INVALID_QUEUE; - } - - try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { - sycl::detail::pi::die( - "Unrecoverable program state reached in event identifier overflow"); - } - result = PI_CHECK_ERROR(hipEventRecord(evEnd_, stream_)); - } catch (pi_result error) { - result = error; - } - - if (result == PI_SUCCESS) { - isRecorded_ = true; - } - - return result; -} - -pi_result _pi_event::wait() { - pi_result retErr; - try { - retErr = PI_CHECK_ERROR(hipEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; - } catch (pi_result error) { - retErr = error; - } - - return retErr; -} - -pi_result _pi_event::release() { - assert(queue_ != nullptr); - PI_CHECK_ERROR(hipEventDestroy(evEnd_)); - - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - PI_CHECK_ERROR(hipEventDestroy(evQueued_)); - PI_CHECK_ERROR(hipEventDestroy(evStart_)); - } - - return PI_SUCCESS; -} - // makes all future work submitted to queue wait for all work captured in event. 
pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the hipStreamWaitEvent call is used. @@ -686,216 +461,6 @@ pi_result hip_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Creates a `pi_queue` object on the HIP backend. -/// Valid properties -/// * __SYCL_PI_HIP_USE_DEFAULT_STREAM -> hipStreamDefault -/// * __SYCL_PI_HIP_SYNC_WITH_DEFAULT -> hipStreamNonBlocking -/// \return Pi queue object mapping to a HIPStream -/// -pi_result hip_piQueueCreate(pi_context context, pi_device device, - pi_queue_properties properties, pi_queue *queue) { - try { - std::unique_ptr<_pi_queue> queueImpl{nullptr}; - - if (context->get_device() != device) { - *queue = nullptr; - return PI_ERROR_INVALID_DEVICE; - } - - unsigned int flags = 0; - - const bool is_out_of_order = - properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - - std::vector computeHipStreams( - is_out_of_order ? _pi_queue::default_num_compute_streams : 1); - std::vector transferHipStreams( - is_out_of_order ? _pi_queue::default_num_transfer_streams : 0); - - queueImpl = std::unique_ptr<_pi_queue>(new _pi_queue{ - std::move(computeHipStreams), std::move(transferHipStreams), context, - device, properties, flags}); - - *queue = queueImpl.release(); - - return PI_SUCCESS; - } catch (pi_result err) { - - return err; - - } catch (...) { - - return PI_ERROR_OUT_OF_RESOURCES; - } -} -pi_result hip_piextQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - assert(Properties); - // Expect flags mask to be passed first. - assert(Properties[0] == PI_QUEUE_FLAGS); - if (Properties[0] != PI_QUEUE_FLAGS) - return PI_ERROR_INVALID_VALUE; - pi_queue_properties Flags = Properties[1]; - // Extra data isn't supported yet. 
- assert(Properties[2] == 0); - if (Properties[2] != 0) - return PI_ERROR_INVALID_VALUE; - return hip_piQueueCreate(Context, Device, Flags, Queue); -} - -pi_result hip_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(command_queue != nullptr); - - switch (param_name) { - case PI_QUEUE_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->context_); - case PI_QUEUE_INFO_DEVICE: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->device_); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->get_reference_count()); - case PI_QUEUE_INFO_PROPERTIES: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->properties_); - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - bool IsReady = command_queue->all_of([](hipStream_t s) -> bool { - const hipError_t ret = hipStreamQuery(s); - if (ret == hipSuccess) - return true; - - if (ret == hipErrorNotReady) - return false; - - PI_CHECK_ERROR(ret); - return false; - }); - return getInfo(param_value_size, param_value, param_value_size_ret, - IsReady); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Queue info request not implemented"); - return {}; -} - -pi_result hip_piQueueRetain(pi_queue command_queue) { - assert(command_queue != nullptr); - assert(command_queue->get_reference_count() > 0); - - command_queue->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result hip_piQueueRelease(pi_queue command_queue) { - assert(command_queue != nullptr); - - if (command_queue->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - try { - std::unique_ptr<_pi_queue> queueImpl(command_queue); - - ScopedContext active(command_queue->get_context()); - - command_queue->for_each_stream([](hipStream_t s) 
{ - PI_CHECK_ERROR(hipStreamSynchronize(s)); - PI_CHECK_ERROR(hipStreamDestroy(s)); - }); - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result hip_piQueueFinish(pi_queue command_queue) { - - // set default result to a negative result (avoid false-positve tests) - pi_result result = PI_ERROR_OUT_OF_HOST_MEMORY; - - try { - - assert(command_queue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); - - command_queue->sync_streams([&result](hipStream_t s) { - result = PI_CHECK_ERROR(hipStreamSynchronize(s)); - }); - - } catch (pi_result err) { - - result = err; - - } catch (...) { - - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; -} - -// There is no HIP counterpart for queue flushing and we don't run into the -// same problem of having to flush cross-queue dependencies as some of the -// other plugins, so it can be left as no-op. -pi_result hip_piQueueFlush(pi_queue command_queue) { - (void)command_queue; - return PI_SUCCESS; -} - -/// Gets the native HIP handle of a PI queue object -/// -/// \param[in] queue The PI queue to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI queue object. -/// -/// \return PI_SUCCESS -pi_result hip_piextQueueGetNativeHandle(pi_queue queue, - pi_native_handle *nativeHandle, - int32_t *NativeHandleDesc) { - *NativeHandleDesc = 0; - ScopedContext active(queue->get_context()); - *nativeHandle = - reinterpret_cast(queue->get_next_compute_stream()); - return PI_SUCCESS; -} - -/// Created a PI queue object from a HIP queue handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI queue object from. -/// \param[in] context is the PI context of the queue. 
-/// \param[out] queue Set to the PI queue object created from native handle. -/// \param ownNativeHandle tells if SYCL RT should assume the ownership of -/// the native handle, if it can. -/// -/// -/// \return TBD -pi_result hip_piextQueueCreateWithNativeHandle( - pi_native_handle nativeHandle, int32_t NativeHandleDesc, pi_context context, - pi_device device, bool ownNativeHandle, pi_queue_properties *Properties, - pi_queue *queue) { - (void)nativeHandle; - (void)NativeHandleDesc; - (void)context; - (void)device; - (void)ownNativeHandle; - (void)Properties; - (void)queue; - sycl::detail::pi::die( - "Creation of PI queue from native handle not implemented"); - return {}; -} - pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, void *ptr, @@ -925,7 +490,7 @@ pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, ptr, size, hipStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -970,7 +535,7 @@ pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, hipStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -987,41 +552,6 @@ pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, return retErr; } -pi_result hip_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { - - try { - assert(num_events != 0); - assert(event_list); - if (num_events == 0) { - return PI_ERROR_INVALID_VALUE; - } - - if (!event_list) { - return PI_ERROR_INVALID_EVENT; - } - - auto context = event_list[0]->get_context(); - ScopedContext active(context); - - auto waitFunc = [context](pi_event event) -> pi_result { - if (!event) { - return PI_ERROR_INVALID_EVENT; - } - - if (event->get_context() != context) { - return PI_ERROR_INVALID_CONTEXT; - } - - return event->wait(); - }; - return 
forLatestEvents(event_list, num_events, waitFunc); - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1109,14 +639,16 @@ pi_result hip_piEnqueueKernelLaunch( { pi_result retError = pi2ur::piDeviceGetInfo( - command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); + reinterpret_cast(command_queue->device_), + PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(maxThreadsPerBlock), + maxThreadsPerBlock, nullptr); assert(retError == PI_SUCCESS); (void)retError; retError = pi2ur::piDeviceGetInfo( - command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); + reinterpret_cast(command_queue->device_), + PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), + &maxWorkGroupSize, nullptr); assert(retError == PI_SUCCESS); // The maxWorkGroupsSize = 1024 for AMD GPU // The maxThreadsPerBlock = {1024, 1024, 1024} @@ -1168,7 +700,9 @@ pi_result hip_piEnqueueKernelLaunch( pi_uint32 stream_token; _pi_stream_guard guard; hipStream_t hipStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); hipFunction_t hipFunc = kernel->get(); retError = enqueueEventsWait(command_queue, hipStream, @@ -1228,7 +762,7 @@ pi_result hip_piEnqueueKernelLaunch( kernel->clear_local_size(); if (event) { - retError = retImplEv->record(); + retError = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } } catch (pi_result err) { @@ -1259,136 +793,6 @@ hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, return {}; } -// -// Events -// -pi_result hip_piEventCreate(pi_context context, pi_event *event) { - (void)context; - (void)event; - - 
sycl::detail::pi::die("PI Event Create not implemented in HIP backend"); -} - -pi_result hip_piEventGetInfo(pi_event event, pi_event_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(event != nullptr); - - switch (param_name) { - case PI_EVENT_INFO_COMMAND_QUEUE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_queue()); - case PI_EVENT_INFO_COMMAND_TYPE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_command_type()); - case PI_EVENT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_reference_count()); - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(event->get_execution_status())); - } - case PI_EVENT_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_context()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_INVALID_EVENT; -} - -/// Obtain profiling information from PI HIP events -/// Timings from HIP are only elapsed time. 
-pi_result hip_piEventGetProfilingInfo(pi_event event, - pi_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - - assert(event != nullptr); - - pi_queue queue = event->get_queue(); - if (queue == nullptr || - !(queue->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE)) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - switch (param_name) { - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No user for this case - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_queued_time()); - case PI_PROFILING_INFO_COMMAND_START: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_start_time()); - case PI_PROFILING_INFO_COMMAND_END: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_end_time()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Event Profiling info request not implemented"); - return {}; -} - -pi_result hip_piEventSetCallback(pi_event event, - pi_int32 command_exec_callback_type, - pfn_notify notify, void *user_data) { - (void)event; - (void)command_exec_callback_type; - (void)notify; - (void)user_data; - - sycl::detail::pi::die("Event Callback not implemented in HIP backend"); - return PI_SUCCESS; -} - -pi_result hip_piEventSetStatus(pi_event event, pi_int32 execution_status) { - (void)event; - (void)execution_status; - - sycl::detail::pi::die("Event Set Status not implemented in HIP backend"); - return PI_ERROR_INVALID_VALUE; -} - -pi_result hip_piEventRetain(pi_event event) { - assert(event != nullptr); - - const auto refCount = event->increment_reference_count(); - - sycl::detail::pi::assertion( - refCount != 0, "Reference count overflow detected in hip_piEventRetain."); - - return PI_SUCCESS; -} - -pi_result hip_piEventRelease(pi_event event) { - assert(event != nullptr); - - // double delete or someone is messing with 
the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - event->get_reference_count() != 0, - "Reference count overflow detected in hip_piEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (event->decrement_reference_count() == 0) { - std::unique_ptr<_pi_event> event_ptr{event}; - pi_result result = PI_ERROR_INVALID_EVENT; - try { - ScopedContext active(event->get_context()); - result = event->release(); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - return result; - } - - return PI_SUCCESS; -} - /// Enqueues a wait on the given queue for all events. /// See \ref enqueueEventWait /// @@ -1424,7 +828,9 @@ pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 stream_token; _pi_stream_guard guard; hipStream_t hipStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); { std::lock_guard guard(command_queue->barrier_mutex_); if (command_queue->barrier_event_ == nullptr) { @@ -1443,16 +849,18 @@ pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } }); } else { // wait just on given events - forLatestEvents(event_wait_list, num_events_in_wait_list, - [hipStream](pi_event event) -> pi_result { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR( - hipStreamWaitEvent(hipStream, event->get(), 0)); - } - }); + forLatestEvents( + reinterpret_cast(event_wait_list), + num_events_in_wait_list, + [hipStream](ur_event_handle_t event) -> ur_result_t { + if (event->get_queue()->has_been_synchronized( + event->get_compute_stream_token())) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR( + hipStreamWaitEvent(hipStream, event->get(), 0)); + } + }); } result = PI_CHECK_ERROR( @@ -1485,40 +893,6 @@ pi_result 
hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } } -/// Gets the native HIP handle of a PI event object -/// -/// \param[in] event The PI event to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the PI event object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_EVENT if given a user event. -pi_result hip_piextEventGetNativeHandle(pi_event event, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(event->get()); - return PI_SUCCESS; -} - -/// Created a PI event object from a HIP event handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI event object from. -/// \param[out] event Set to the PI event object created from native handle. -/// -/// \return TBD -pi_result hip_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_event *event) { - (void)nativeHandle; - (void)context; - (void)ownNativeHandle; - (void)event; - - sycl::detail::pi::die( - "Creation of PI event from native handle not implemented"); - return {}; -} - /// General 3D memory copy operation. 
/// This function requires the corresponding HIP context to be at the top of /// the context stack @@ -1612,7 +986,7 @@ pi_result hip_piEnqueueMemBufferReadRect( host_offset, host_row_pitch, host_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -1662,7 +1036,7 @@ pi_result hip_piEnqueueMemBufferWriteRect( buffer_row_pitch, buffer_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -1704,7 +1078,7 @@ pi_result hip_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto src = src_buffer->mem_.buffer_mem_.get_with_offset(src_offset); @@ -1713,7 +1087,7 @@ pi_result hip_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -1808,7 +1182,7 @@ pi_result hip_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset); @@ -1870,7 +1244,7 @@ pi_result hip_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -2372,7 +1746,9 @@ pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_uint32 stream_token; _pi_stream_guard guard; hipStream_t hipStream = 
queue->get_next_compute_stream( - num_events_in_waitlist, events_waitlist, guard, &stream_token); + num_events_in_waitlist, + reinterpret_cast(events_waitlist), guard, + &stream_token); result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -2384,7 +1760,7 @@ pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, hipMemsetD8Async(reinterpret_cast(ptr), (unsigned char)value & 0xFF, count, hipStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -2420,7 +1796,7 @@ pi_result hip_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, result = PI_CHECK_ERROR( hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); } if (blocking) { result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); @@ -2461,7 +1837,7 @@ pi_result hip_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, result = PI_CHECK_ERROR(hipMemPrefetchAsync( ptr, size, queue->get_context()->get_device()->get(), hipStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -2714,15 +2090,20 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextContextCreateWithNativeHandle, pi2ur::piextContextCreateWithNativeHandle) // Queue - _PI_CL(piQueueCreate, hip_piQueueCreate) - _PI_CL(piextQueueCreate, hip_piextQueueCreate) - _PI_CL(piQueueGetInfo, hip_piQueueGetInfo) - _PI_CL(piQueueFinish, hip_piQueueFinish) - _PI_CL(piQueueFlush, hip_piQueueFlush) - _PI_CL(piQueueRetain, hip_piQueueRetain) - _PI_CL(piQueueRelease, hip_piQueueRelease) - _PI_CL(piextQueueGetNativeHandle, hip_piextQueueGetNativeHandle) - _PI_CL(piextQueueCreateWithNativeHandle, hip_piextQueueCreateWithNativeHandle) + _PI_CL(piQueueCreate, 
pi2ur::piQueueCreate) + _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate) + _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2) + _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) + _PI_CL(piQueueFinish, pi2ur::piQueueFinish) + _PI_CL(piQueueFlush, pi2ur::piQueueFlush) + _PI_CL(piQueueRetain, pi2ur::piQueueRetain) + _PI_CL(piQueueRelease, pi2ur::piQueueRelease) + _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) + _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2) + _PI_CL(piextQueueCreateWithNativeHandle, + pi2ur::piextQueueCreateWithNativeHandle) + _PI_CL(piextQueueCreateWithNativeHandle2, + pi2ur::piextQueueCreateWithNativeHandle2) // Memory _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) @@ -2760,16 +2141,17 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) // Event - _PI_CL(piEventCreate, hip_piEventCreate) - _PI_CL(piEventGetInfo, hip_piEventGetInfo) - _PI_CL(piEventGetProfilingInfo, hip_piEventGetProfilingInfo) - _PI_CL(piEventsWait, hip_piEventsWait) - _PI_CL(piEventSetCallback, hip_piEventSetCallback) - _PI_CL(piEventSetStatus, hip_piEventSetStatus) - _PI_CL(piEventRetain, hip_piEventRetain) - _PI_CL(piEventRelease, hip_piEventRelease) - _PI_CL(piextEventGetNativeHandle, hip_piextEventGetNativeHandle) - _PI_CL(piextEventCreateWithNativeHandle, hip_piextEventCreateWithNativeHandle) + _PI_CL(piEventCreate, pi2ur::piEventCreate) + _PI_CL(piEventGetInfo, pi2ur::piEventGetInfo) + _PI_CL(piEventGetProfilingInfo, pi2ur::piEventGetProfilingInfo) + _PI_CL(piEventsWait, pi2ur::piEventsWait) + _PI_CL(piEventSetCallback, pi2ur::piEventSetCallback) + _PI_CL(piEventSetStatus, pi2ur::piEventSetStatus) + _PI_CL(piEventRetain, pi2ur::piEventRetain) + _PI_CL(piEventRelease, pi2ur::piEventRelease) + _PI_CL(piextEventGetNativeHandle, pi2ur::piextEventGetNativeHandle) + 
_PI_CL(piextEventCreateWithNativeHandle, + pi2ur::piextEventCreateWithNativeHandle) // Sampler _PI_CL(piSamplerCreate, pi2ur::piSamplerCreate) _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index f8a638327f103..b9b3255f21815 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -41,22 +41,16 @@ #include #include +#include #include #include #include -#include #include +#include +#include #include "pi2ur.hpp" -extern "C" { - -/// \cond INGORE_BLOCK_IN_DOXYGEN -pi_result hip_piQueueRelease(pi_queue); -pi_result hip_piQueueRetain(pi_queue); -/// \endcond -} - using _pi_stream_guard = std::unique_lock; /// A PI platform stores all known PI devices, @@ -127,355 +121,74 @@ struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; -/// PI queue mapping on to hipStream_t objects. -/// -struct _pi_queue { - using native_type = hipStream_t; - static constexpr int default_num_compute_streams = 64; - static constexpr int default_num_transfer_streams = 16; - - std::vector compute_streams_; - std::vector transfer_streams_; - // delay_compute_ keeps track of which streams have been recently reused and - // their next use should be delayed. If a stream has been recently reused it - // will be skipped the next time it would be selected round-robin style. When - // skipped, its delay flag is cleared. 
- std::vector delay_compute_; - // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - _pi_context *context_; - _pi_device *device_; - pi_queue_properties properties_; - hipEvent_t barrier_event_ = nullptr; - hipEvent_t barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first - // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - - _pi_queue(std::vector &&compute_streams, - std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties, - unsigned int flags) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, - last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, - flags_(flags) { - pi2ur::piContextRetain(context_); - pi2ur::piDeviceRetain(device_); - } - - ~_pi_queue() { - pi2ur::piContextRelease(context_); - pi2ur::piDeviceRelease(device_); - } - - void compute_stream_wait_for_barrier_if_needed(hipStream_t stream, - pi_uint32 stream_i); - void 
transfer_stream_wait_for_barrier_if_needed(hipStream_t stream, - pi_uint32 stream_i); - - // get_next_compute/transfer_stream() functions return streams from - // appropriate pools in round-robin fashion - native_type get_next_compute_stream(pi_uint32 *stream_token = nullptr); - // this overload tries select a stream that was used by one of dependancies. - // If that is not possible returns a new stream. If a stream is reused it - // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - return last_sync_compute_streams_ > stream_token; - } - - bool can_reuse_stream(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - // If the command represented by the stream token was not the last command - // enqueued to the stream we can not reuse the stream - we need to allow for - // commands enqueued after it and the one we are about to enqueue to run - // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); - // If there was a barrier enqueued to the queue after the command - // represented by the stream token we should not reuse the stream, as we can - // not take that stream into account for the bookkeeping for the next - // barrier - such a stream would not be synchronized with. 
Performance-wise - // it does not matter that we do not reuse the stream, as the work - // represented by the stream token is guaranteed to be complete by the - // barrier before any work we are about to enqueue to the stream will start, - // so the event does not need to be synchronized with. - return is_last_command && !has_been_synchronized(stream_token); - } - - template bool all_of(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) - return false; - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) - return false; - } - return true; - } - - template void for_each_stream(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); - } - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); - } - } - } - - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; - } - }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - } - }; - { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard( - 
compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? num_compute_streams_ - : compute_stream_idx_.load(); - if (end - start >= size) { - sync_compute(0, size); - } else { - start %= size; - end %= size; - if (start < end) { - sync_compute(start, end); - } else { - sync_compute(start, size); - sync_compute(0, end); - } - } - if (ResetUsed) { - last_sync_compute_streams_ = end; - } - } - { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? num_transfer_streams_ - : transfer_stream_idx_.load(); - if (end - start >= size) { - sync_transfer(0, size); - } else { - start %= size; - end %= size; - if (start < end) { - sync_transfer(start, end); - } else { - sync_transfer(start, size); - sync_transfer(0, end); - } - } - if (ResetUsed) { - last_sync_transfer_streams_ = end; - } - } - } - } - - _pi_context *get_context() const { return context_; }; - - _pi_device *get_device() const { return device_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_uint32 get_next_event_id() noexcept { return ++eventCount_; } +struct _pi_queue : ur_queue_handle_t_ { + using ur_queue_handle_t_::ur_queue_handle_t_; }; typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData); -/// PI Event mapping to hipEvent_t -/// -struct _pi_event { -public: - using native_type = hipEvent_t; - - pi_result record(); - pi_result wait(); +struct _pi_event : ur_event_handle_t_ { + using ur_event_handle_t_::ur_event_handle_t_; - pi_result start(); - - native_type get() const 
noexcept { return evEnd_; }; - - pi_queue get_queue() const noexcept { return queue_; } - - hipStream_t get_stream() const noexcept { return stream_; } - - pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; } - - pi_command_type get_command_type() const noexcept { return commandType_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - bool is_recorded() const noexcept { return isRecorded_; } - - bool is_started() const noexcept { return isStarted_; } - - bool is_completed() const noexcept; - - pi_int32 get_execution_status() const noexcept { - - if (!is_recorded()) { - return PI_EVENT_SUBMITTED; - } - - if (!is_completed()) { - return PI_EVENT_RUNNING; - } - return PI_EVENT_COMPLETE; - } - - pi_context get_context() const noexcept { return context_; }; - - pi_uint32 increment_reference_count() { return ++refCount_; } - - pi_uint32 decrement_reference_count() { return --refCount_; } - - pi_uint32 get_event_id() const noexcept { return eventId_; } - - // Returns the counter time when the associated command(s) were enqueued - // - pi_uint64 get_queued_time() const; - - // Returns the counter time when the associated command(s) started execution - // - pi_uint64 get_start_time() const; - - // Returns the counter time when the associated command(s) completed - // - pi_uint64 get_end_time() const; - - // construct a native HIP. This maps closely to the underlying HIP event. + // Helpers for queue command implementations until they also get ported to UR static pi_event make_native(pi_command_type type, pi_queue queue, hipStream_t stream, - pi_uint32 stream_token = std::numeric_limits::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, - stream_token); - } - - pi_result release(); - - ~_pi_event(); - -private: - // This constructor is private to force programmers to use the make_native / - // make_user static members in order to create a pi_event for HIP. 
- _pi_event(pi_command_type type, pi_context context, pi_queue queue, - hipStream_t stream, pi_uint32 stream_token); - - pi_command_type commandType_; // The type of command associated with event. - - std::atomic_uint32_t refCount_; // Event reference count. - - bool hasBeenWaitedOn_; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. - - bool isRecorded_; // Signifies wether a native HIP event has been recorded - // yet. - bool isStarted_; // Signifies wether the operation associated with the - // PI event has started or not - // - - pi_uint32 streamToken_; - pi_uint32 eventId_; // Queue identifier of the event. - - native_type evEnd_; // HIP event handle. If this _pi_event represents a user - // event, this will be nullptr. - - native_type evStart_; // HIP event handle associated with the start - - native_type evQueued_; // HIP event handle associated with the time - // the command was enqueued - - pi_queue queue_; // pi_queue associated with the event. If this is a user - // event, this will be nullptr. - - hipStream_t stream_; // hipStream_t associated with the event. If this is a - // user event, this will be uninitialized. 
+ uint32_t stream_token = std::numeric_limits::max()) { + auto urQueue = reinterpret_cast(queue); + static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { + {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, + {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, + UR_COMMAND_MEM_BUFFER_WRITE_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, + {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, + {PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, + {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, + {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, + }; - pi_context context_; // pi_context associated with the event. If this is a - // native event, this will be the same context associated - // with the queue_ member. + // TODO(ur): There is no exact mapping for the following commands. Just + // default to KERNEL_LAUNCH for now. 
+ // PI_COMMAND_TYPE_USER + // PI_COMMAND_TYPE_MEM_BUFFER_FILL, + // PI_COMMAND_TYPE_IMAGE_READ, + // PI_COMMAND_TYPE_IMAGE_WRITE, + // PI_COMMAND_TYPE_IMAGE_COPY, + // PI_COMMAND_TYPE_NATIVE_KERNEL, + // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, + // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, + // PI_COMMAND_TYPE_MAP_IMAGE, + // PI_COMMAND_TYPE_MARKER, + // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, + // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, + // PI_COMMAND_TYPE_BARRIER, + // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, + // PI_COMMAND_TYPE_FILL_IMAGE + // PI_COMMAND_TYPE_SVM_FREE + // PI_COMMAND_TYPE_SVM_MEMCPY + // PI_COMMAND_TYPE_SVM_MEMFILL + // PI_COMMAND_TYPE_SVM_MAP + // PI_COMMAND_TYPE_SVM_UNMAP + + ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; + auto cmdIt = cmdMap.find(type); + if (cmdIt != cmdMap.end()) { + urCmd = cmdIt->second; + } + return reinterpret_cast( + ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); + } }; /// Implementation of PI Program on HIP Module object diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 520e2bd2f86af..c673a249dfb78 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -183,6 +183,8 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/context.hpp" "ur/adapters/hip/device.cpp" "ur/adapters/hip/device.hpp" + "ur/adapters/hip/event.cpp" + "ur/adapters/hip/event.hpp" "ur/adapters/hip/platform.cpp" "ur/adapters/hip/platform.hpp" "ur/adapters/hip/memory.cpp" @@ -194,6 +196,8 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/program.hpp" "ur/adapters/hip/kernel.cpp" "ur/adapters/hip/kernel.hpp" + "ur/adapters/hip/queue.cpp" + "ur/adapters/hip/queue.hpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp index 3037f2943dc8a..05f246ef7dc1a 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp @@ -11,10 +11,6 @@ #include "device.hpp" #include "platform.hpp" -// We need this declaration temporarily while UR and PI share ScopedContext -class _pi_context; -using pi_context = _pi_context *; - typedef void (*ur_context_extended_deleter_t)(void *user_data); struct ur_context_handle_t_ { @@ -84,10 +80,6 @@ class ScopedContext { bool needToRecover_; public: - // TODO(ur): Needed for compatibility with PI; once the HIP PI plugin is - // fully moved over we can drop this constructor - ScopedContext(pi_context ctxt); - ScopedContext(ur_context_handle_t ctxt) : placedContext_{ctxt}, needToRecover_{false} { @@ -119,4 +111,4 @@ class ScopedContext { } } }; -} // namespace \ No newline at end of file +} // namespace diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp new file mode 100644 index 0000000000000..8267ef36f54df --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -0,0 +1,331 @@ +//===--------- event.cpp - HIP Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "event.hpp" +#include "common.hpp" +#include "context.hpp" +#include "platform.hpp" + +ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, + ur_context_handle_t context, + ur_queue_handle_t queue, + hipStream_t stream, + uint32_t stream_token) + : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false}, + isRecorded_{false}, isStarted_{false}, + streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { + + bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; + + UR_CHECK_ERROR(hipEventCreateWithFlags( + &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming)); + + if (profilingEnabled) { + UR_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault)); + UR_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault)); + } + + if (queue_ != nullptr) { + urQueueRetain(queue_); + } + urContextRetain(context_); +} + +ur_event_handle_t_::~ur_event_handle_t_() { + if (queue_ != nullptr) { + urQueueRelease(queue_); + } + urContextRelease(context_); +} + +ur_result_t ur_event_handle_t_::start() { + assert(!is_started()); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
+ UR_CHECK_ERROR(hipEventRecord(evQueued_, 0)); + UR_CHECK_ERROR(hipEventRecord(evStart_, queue_->get())); + } + } catch (ur_result_t error) { + result = error; + } + + isStarted_ = true; + return result; +} + +bool ur_event_handle_t_::is_completed() const noexcept { + if (!isRecorded_) { + return false; + } + if (!hasBeenWaitedOn_) { + const hipError_t ret = hipEventQuery(evEnd_); + if (ret != hipSuccess && ret != hipErrorNotReady) { + UR_CHECK_ERROR(ret); + return false; + } + if (ret == hipErrorNotReady) { + return false; + } + } + return true; +} + +uint64_t ur_event_handle_t_::get_queued_time() const { + float miliSeconds = 0.0f; + assert(is_started()); + + UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_)); + return static_cast(miliSeconds * 1.0e6); +} + +uint64_t ur_event_handle_t_::get_start_time() const { + float miliSeconds = 0.0f; + assert(is_started()); + + UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, + ur_platform_handle_t_::evBase_, evStart_)); + return static_cast(miliSeconds * 1.0e6); +} + +uint64_t ur_event_handle_t_::get_end_time() const { + float miliSeconds = 0.0f; + assert(is_started() && is_recorded()); + + UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, + ur_platform_handle_t_::evBase_, evEnd_)); + return static_cast(miliSeconds * 1.0e6); +} + +ur_result_t ur_event_handle_t_::record() { + + if (is_recorded() || !is_started()) { + return UR_RESULT_ERROR_INVALID_EVENT; + } + + ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + + UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + + try { + eventId_ = queue_->get_next_event_id(); + if (eventId_ == 0) { + sycl::detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + result = UR_CHECK_ERROR(hipEventRecord(evEnd_, stream_)); + } catch (ur_result_t error) { + result = error; + } + + if (result == UR_RESULT_SUCCESS) { + isRecorded_ = true; + } + + return result; +} + +ur_result_t ur_event_handle_t_::wait() { + ur_result_t retErr; + 
try { + retErr = UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); + hasBeenWaitedOn_ = true; + } catch (ur_result_t error) { + retErr = error; + } + + return retErr; +} + +ur_result_t ur_event_handle_t_::release() { + assert(queue_ != nullptr); + UR_CHECK_ERROR(hipEventDestroy(evEnd_)); + + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(hipEventDestroy(evQueued_)); + UR_CHECK_ERROR(hipEventDestroy(evStart_)); + } + + return UR_RESULT_SUCCESS; +} + +//////////////////// + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + try { + + auto context = phEventWaitList[0]->get_context(); + ScopedContext active(context); + + auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { + UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(event->get_context() == context, + UR_RESULT_ERROR_INVALID_CONTEXT); + + return event->wait(); + }; + return forLatestEvents(phEventWaitList, numEvents, waitFunc); + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +// +// Events +// + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + ur_event_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_SIZE); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + switch (propName) { + case UR_EVENT_INFO_COMMAND_QUEUE: + return ReturnValue(hEvent->get_queue()); + case UR_EVENT_INFO_COMMAND_TYPE: + return ReturnValue(hEvent->get_command_type()); + case UR_EVENT_INFO_REFERENCE_COUNT: + return ReturnValue(hEvent->get_reference_count()); + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: + return ReturnValue(hEvent->get_execution_status()); + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->get_context()); + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// Obtain profiling information from UR HIP events +/// Timings from HIP are only elapsed time. 
+UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t hEvent, ur_profiling_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { + + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); + + ur_queue_handle_t queue = hEvent->get_queue(); + if (queue == nullptr || + !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No user for this case + return ReturnValue(static_cast(hEvent->get_queued_time())); + case UR_PROFILING_INFO_COMMAND_START: + return ReturnValue(static_cast(hEvent->get_start_time())); + case UR_PROFILING_INFO_COMMAND_END: + return ReturnValue(static_cast(hEvent->get_end_time())); + default: + break; + } + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, + ur_event_callback_t pfnNotify, void *pUserData) { + std::ignore = hEvent; + std::ignore = execStatus; + std::ignore = pfnNotify; + std::ignore = pUserData; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + const auto refCount = hEvent->increment_reference_count(); + + sycl::detail::ur::assertion( + refCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. 
+ sycl::detail::ur::assertion( + hEvent->get_reference_count() != 0, + "Reference count overflow detected in urEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (hEvent->decrement_reference_count() == 0) { + std::unique_ptr event_ptr{hEvent}; + ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + try { + ScopedContext active(hEvent->get_context()); + result = hEvent->release(); + } catch (...) { + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native HIP handle of a UR event object +/// +/// \param[in] event The UR event to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the UR event object. +/// +/// \return UR_RESULT_SUCCESS on success. UR_RESULT_ERROR_INVALID_EVENT if given +/// a user event. +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeEvent, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeEvent = reinterpret_cast(hEvent->get()); + return UR_RESULT_SUCCESS; +} + +/// Created a UR event object from a HIP event handle. +/// TODO: Implement this. +/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR event object from. +/// \param[out] event Set to the UR event object created from native handle. 
+/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + + std::ignore = hNativeEvent; + std::ignore = hContext; + std::ignore = pProperties; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp new file mode 100644 index 0000000000000..3c9700419cd8b --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp @@ -0,0 +1,177 @@ +//===--------- event.hpp - HIP Adapter -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" +#include "queue.hpp" + +/// UR Event mapping to hipEvent_t +/// +struct ur_event_handle_t_ { +public: + using native_type = hipEvent_t; + + ur_result_t record(); + + ur_result_t wait(); + + ur_result_t start(); + + native_type get() const noexcept { return evEnd_; }; + + ur_queue_handle_t get_queue() const noexcept { return queue_; } + + hipStream_t get_stream() const noexcept { return stream_; } + + uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + + ur_command_t get_command_type() const noexcept { return commandType_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + bool is_recorded() const noexcept { return isRecorded_; } + + bool is_started() const noexcept { return isStarted_; } + + bool is_completed() const noexcept; + + uint32_t get_execution_status() const noexcept { + + if (!is_recorded()) { + return UR_EVENT_STATUS_SUBMITTED; + } + + if 
(!is_completed()) {
+ return UR_EVENT_STATUS_RUNNING;
+ }
+ return UR_EVENT_STATUS_COMPLETE;
+ }
+
+ ur_context_handle_t get_context() const noexcept { return context_; };
+
+ uint32_t increment_reference_count() { return ++refCount_; }
+
+ uint32_t decrement_reference_count() { return --refCount_; }
+
+ uint32_t get_event_id() const noexcept { return eventId_; }
+
+ // Returns the counter time when the associated command(s) were enqueued
+ //
+ uint64_t get_queued_time() const;
+
+ // Returns the counter time when the associated command(s) started execution
+ //
+ uint64_t get_start_time() const;
+
+ // Returns the counter time when the associated command(s) completed
+ //
+ uint64_t get_end_time() const;
+
+ // construct a native HIP event. This maps closely to the underlying HIP event.
+ static ur_event_handle_t
+ make_native(ur_command_t type, ur_queue_handle_t queue, hipStream_t stream,
+ uint32_t stream_token = std::numeric_limits::max()) {
+ return new ur_event_handle_t_(type, queue->get_context(), queue, stream,
+ stream_token);
+ }
+
+ ur_result_t release();
+
+ ~ur_event_handle_t_();
+
+private:
+ // This constructor is private to force programmers to use the make_native /
+ // make_user static members in order to create a ur_event_handle_t for HIP.
+ ur_event_handle_t_(ur_command_t type, ur_context_handle_t context,
+ ur_queue_handle_t queue, hipStream_t stream,
+ uint32_t stream_token);
+
+ ur_command_t commandType_; // The type of command associated with event.
+
+ std::atomic_uint32_t refCount_; // Event reference count.
+
+ bool hasBeenWaitedOn_; // Signifies whether the event has been waited
+ // on through a call to wait(), which implies
+ // that it has completed.
+
+ bool isRecorded_; // Signifies whether a native HIP event has been recorded
+ // yet.
+ bool isStarted_; // Signifies whether the operation associated with the
+ // UR event has started or not
+ //
+
+ uint32_t streamToken_;
+ uint32_t eventId_; // Queue identifier of the event.
+ + native_type evEnd_; // HIP event handle. If this ur_event_handle_t_ represents + // a user event, this will be nullptr. + + native_type evStart_; // HIP event handle associated with the start + + native_type evQueued_; // HIP event handle associated with the time + // the command was enqueued + + ur_queue_handle_t queue_; // ur_queue_handle_t associated with the event. If + // this is a user event, this will be nullptr. + + hipStream_t stream_; // hipStream_t associated with the event. If this is a + // user event, this will be uninitialized. + + ur_context_handle_t + context_; // ur_context_handle_t associated with the event. If this + // is a native event, this will be the same + // context associated with the queue_ member. +}; + +// Iterates over the event wait list, returns correct ur_result_t error codes. +// Invokes the callback for the latest event of each queue in the wait list. +// The callback must take a single ur_event_handle_t argument and return a +// ur_result_t. +template +ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, + size_t num_events_in_wait_list, Func &&f) { + + if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + // Fast path if we only have a single event + if (num_events_in_wait_list == 1) { + return f(event_wait_list[0]); + } + + std::vector events{ + event_wait_list, event_wait_list + num_events_in_wait_list}; + std::sort(events.begin(), events.end(), + [](ur_event_handle_t e0, ur_event_handle_t e1) { + // Tiered sort creating sublists of streams (smallest value first) + // in which the corresponding events are sorted into a sequence of + // newest first. 
+ return e0->get_stream() < e1->get_stream() || + (e0->get_stream() == e1->get_stream() && + e0->get_event_id() > e1->get_event_id()); + }); + + bool first = true; + hipStream_t lastSeenStream = 0; + for (ur_event_handle_t event : events) { + if (!event || (!first && event->get_stream() == lastSeenStream)) { + continue; + } + + first = false; + lastSeenStream = event->get_stream(); + + auto result = f(event); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp new file mode 100644 index 0000000000000..272e8b2a29ea7 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -0,0 +1,300 @@ +//===--------- queue.cpp - HIP Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "queue.hpp" +#include "context.hpp" +#include "event.hpp" + +void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( + hipStream_t stream, uint32_t stream_i) { + if (barrier_event_ && !compute_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); + compute_applied_barrier_[stream_i] = true; + } +} + +void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( + hipStream_t stream, uint32_t stream_i) { + if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); + transfer_applied_barrier_[stream_i] = true; + } +} + +hipStream_t +ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { + uint32_t stream_i; + uint32_t token; + while (true) { + if (num_compute_streams_ < compute_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(compute_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_compute_streams_ after that + if (num_compute_streams_ < compute_streams_.size()) { + UR_CHECK_ERROR(hipStreamCreateWithFlags( + &compute_streams_[num_compute_streams_++], flags_)); + } + } + token = compute_stream_idx_++; + stream_i = token % compute_streams_.size(); + // if a stream has been reused before it was next selected round-robin + // fashion, we want to delay its next use and instead select another one + // that is more likely to have completed all the enqueued work. 
+ if (delay_compute_[stream_i]) { + delay_compute_[stream_i] = false; + } else { + break; + } + } + if (stream_token) { + *stream_token = token; + } + hipStream_t res = compute_streams_[stream_i]; + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +hipStream_t ur_queue_handle_t_::get_next_compute_stream( + uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, + ur_stream_quard &guard, uint32_t *stream_token) { + for (uint32_t i = 0; i < num_events_in_wait_list; i++) { + uint32_t token = event_wait_list[i]->get_compute_stream_token(); + if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { + std::unique_lock compute_sync_guard( + compute_stream_sync_mutex_); + // redo the check after lock to avoid data races on + // last_sync_compute_streams_ + if (can_reuse_stream(token)) { + uint32_t stream_i = token % delay_compute_.size(); + delay_compute_[stream_i] = true; + if (stream_token) { + *stream_token = token; + } + guard = ur_stream_quard{std::move(compute_sync_guard)}; + hipStream_t res = event_wait_list[i]->get_stream(); + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; + } + } + } + guard = {}; + return get_next_compute_stream(stream_token); +} + +hipStream_t ur_queue_handle_t_::get_next_transfer_stream() { + if (transfer_streams_.empty()) { // for example in in-order queue + return get_next_compute_stream(); + } + if (num_transfer_streams_ < transfer_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(transfer_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_transfer_streams_ after that + if (num_transfer_streams_ < transfer_streams_.size()) { + UR_CHECK_ERROR(hipStreamCreateWithFlags( + &transfer_streams_[num_transfer_streams_++], flags_)); + } + } + uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); + hipStream_t res = 
transfer_streams_[stream_i]; + transfer_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +/////////////////////////////// + +UR_APIEXPORT ur_result_t UR_APICALL +urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + try { + std::unique_ptr queueImpl{nullptr}; + + if (hContext->get_device() != hDevice) { + *phQueue = nullptr; + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + unsigned int flags = 0; + + const bool is_out_of_order = + pProps->flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + + std::vector computeHipStreams( + is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); + std::vector transferHipStreams( + is_out_of_order ? ur_queue_handle_t_::default_num_transfer_streams : 0); + + queueImpl = std::unique_ptr(new ur_queue_handle_t_{ + std::move(computeHipStreams), std::move(transferHipStreams), hContext, + hDevice, flags, pProps->flags}); + + *phQueue = queueImpl.release(); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + + return err; + + } catch (...) 
{ + + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, + ur_queue_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropSizeRet) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hQueue->context_); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hQueue->device_); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(hQueue->get_reference_count()); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->ur_flags_); + case UR_QUEUE_INFO_EMPTY: { + bool IsReady = hQueue->all_of([](hipStream_t s) -> bool { + const hipError_t ret = hipStreamQuery(s); + if (ret == hipSuccess) + return true; + + try { + UR_CHECK_ERROR(ret); + } catch (...) { + return false; + } + + return false; + }); + return ReturnValue(IsReady); + } + default: + break; + } + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue->get_reference_count() > 0, UR_RESULT_ERROR_INVALID_QUEUE); + + hQueue->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (hQueue->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + try { + std::unique_ptr queueImpl(hQueue); + + ScopedContext active(hQueue->get_context()); + + hQueue->for_each_stream([](hipStream_t s) { + UR_CHECK_ERROR(hipStreamSynchronize(s)); + UR_CHECK_ERROR(hipStreamDestroy(s)); + }); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) 
{
+ return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ }
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
+ UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+ // set default result to a negative result (avoid false-positive tests)
+ ur_result_t result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+
+ try {
+
+ ScopedContext active(hQueue->get_context());
+
+ hQueue->sync_streams([&result](hipStream_t s) {
+ result = UR_CHECK_ERROR(hipStreamSynchronize(s));
+ });
+
+ } catch (ur_result_t err) {
+
+ result = err;
+
+ } catch (...) {
+
+ result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ }
+
+ return result;
+}
+
+// There is no HIP counterpart for queue flushing and we don't run into the
+// same problem of having to flush cross-queue dependencies as some of the
+// other plugins, so it can be left as no-op.
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) {
+ std::ignore = hQueue;
+ return UR_RESULT_SUCCESS;
+}
+
+/// Gets the native HIP handle of a UR queue object
+///
+/// \param[in] queue The UR queue to get the native HIP object of.
+/// \param[out] nativeHandle Set to the native handle of the UR queue object.
+///
+/// \return UR_RESULT_SUCCESS
+UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
+ ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) {
+ UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ ScopedContext active(hQueue->get_context());
+ *phNativeQueue =
+ reinterpret_cast(hQueue->get_next_compute_stream());
+ return UR_RESULT_SUCCESS;
+}
+
+/// Creates a UR queue object from a HIP queue handle.
+/// TODO: Implement this.
+/// NOTE: The created UR object takes ownership of the native handle.
+///
+/// \param[in] nativeHandle The native handle to create UR queue object from.
+/// \param[in] context is the UR context of the queue.
+/// \param[out] queue Set to the UR queue object created from native handle.
+/// \param ownNativeHandle tells if SYCL RT should assume the ownership of +/// the native handle, if it can. +/// +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + + std::ignore = hNativeQueue; + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = pProperties; + std::ignore = phQueue; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp new file mode 100644 index 0000000000000..f391f1cc82a7c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp @@ -0,0 +1,243 @@ +//===--------- queue.hpp - HIP Adapter -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "common.hpp" + +using ur_stream_quard = std::unique_lock; + +/// UR queue mapping on to hipStream_t objects. +/// +struct ur_queue_handle_t_ { + using native_type = hipStream_t; + static constexpr int default_num_compute_streams = 64; + static constexpr int default_num_transfer_streams = 16; + + std::vector compute_streams_; + std::vector transfer_streams_; + // delay_compute_ keeps track of which streams have been recently reused and + // their next use should be delayed. If a stream has been recently reused it + // will be skipped the next time it would be selected round-robin style. When + // skipped, its delay flag is cleared. 
+ std::vector delay_compute_; + // keep track of which streams have applied barrier + std::vector compute_applied_barrier_; + std::vector transfer_applied_barrier_; + ur_context_handle_t context_; + ur_device_handle_t device_; + hipEvent_t barrier_event_ = nullptr; + hipEvent_t barrier_tmp_event_ = nullptr; + std::atomic_uint32_t refCount_; + std::atomic_uint32_t eventCount_; + std::atomic_uint32_t compute_stream_idx_; + std::atomic_uint32_t transfer_stream_idx_; + unsigned int num_compute_streams_; + unsigned int num_transfer_streams_; + unsigned int last_sync_compute_streams_; + unsigned int last_sync_transfer_streams_; + unsigned int flags_; + ur_queue_flags_t ur_flags_; + // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be + // locked at the same time, compute_stream_sync_mutex_ should be locked first + // to avoid deadlocks + std::mutex compute_stream_sync_mutex_; + std::mutex compute_stream_mutex_; + std::mutex transfer_stream_mutex_; + std::mutex barrier_mutex_; + + ur_queue_handle_t_(std::vector &&compute_streams, + std::vector &&transfer_streams, + ur_context_handle_t context, ur_device_handle_t device, + unsigned int flags, ur_queue_flags_t ur_flags) + : compute_streams_{std::move(compute_streams)}, + transfer_streams_{std::move(transfer_streams)}, + delay_compute_(compute_streams_.size(), false), + compute_applied_barrier_(compute_streams_.size()), + transfer_applied_barrier_(transfer_streams_.size()), context_{context}, + device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, + transfer_stream_idx_{0}, num_compute_streams_{0}, + num_transfer_streams_{0}, last_sync_compute_streams_{0}, + last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags) { + urContextRetain(context_); + urDeviceRetain(device_); + } + + ~ur_queue_handle_t_() { + urContextRelease(context_); + urDeviceRelease(device_); + } + + void compute_stream_wait_for_barrier_if_needed(hipStream_t stream, + uint32_t stream_i); + void 
transfer_stream_wait_for_barrier_if_needed(hipStream_t stream,
+ uint32_t stream_i);
+
+ // get_next_compute/transfer_stream() functions return streams from
+ // appropriate pools in round-robin fashion
+ native_type get_next_compute_stream(uint32_t *stream_token = nullptr);
+ // this overload tries to select a stream used by one of the dependencies.
+ // If that is not possible returns a new stream. If a stream is reused it
+ // returns a lock that needs to remain locked as long as the stream is in use
+ native_type get_next_compute_stream(uint32_t num_events_in_wait_list,
+ const ur_event_handle_t *event_wait_list,
+ ur_stream_quard &guard,
+ uint32_t *stream_token = nullptr);
+ native_type get_next_transfer_stream();
+ native_type get() { return get_next_compute_stream(); };
+
+ bool has_been_synchronized(uint32_t stream_token) {
+ // stream token not associated with one of the compute streams
+ if (stream_token == std::numeric_limits::max()) {
+ return false;
+ }
+ return last_sync_compute_streams_ > stream_token;
+ }
+
+ bool can_reuse_stream(uint32_t stream_token) {
+ // stream token not associated with one of the compute streams
+ if (stream_token == std::numeric_limits::max()) {
+ return false;
+ }
+ // If the command represented by the stream token was not the last command
+ // enqueued to the stream we can not reuse the stream - we need to allow for
+ // commands enqueued after it and the one we are about to enqueue to run
+ // concurrently
+ bool is_last_command =
+ (compute_stream_idx_ - stream_token) <= compute_streams_.size();
+ // If there was a barrier enqueued to the queue after the command
+ // represented by the stream token we should not reuse the stream, as we can
+ // not take that stream into account for the bookkeeping for the next
+ // barrier - such a stream would not be synchronized with.
Performance-wise + // it does not matter that we do not reuse the stream, as the work + // represented by the stream token is guaranteed to be complete by the + // barrier before any work we are about to enqueue to the stream will start, + // so the event does not need to be synchronized with. + return is_last_command && !has_been_synchronized(stream_token); + } + + template bool all_of(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, + f)) + return false; + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + if (!std::all_of(transfer_streams_.begin(), + transfer_streams_.begin() + end, f)) + return false; + } + return true; + } + + template void for_each_stream(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + for (unsigned int i = 0; i < end; i++) { + f(compute_streams_[i]); + } + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + for (unsigned int i = 0; i < end; i++) { + f(transfer_streams_[i]); + } + } + } + + template void sync_streams(T &&f) { + auto sync_compute = [&f, &streams = compute_streams_, + &delay = delay_compute_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + delay[i] = false; + } + }; + auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + } + }; + { + unsigned int size = static_cast(compute_streams_.size()); + std::lock_guard 
compute_sync_guard(compute_stream_sync_mutex_); + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int start = last_sync_compute_streams_; + unsigned int end = num_compute_streams_ < size + ? num_compute_streams_ + : compute_stream_idx_.load(); + if (end - start >= size) { + sync_compute(0, size); + } else { + start %= size; + end %= size; + if (start < end) { + sync_compute(start, end); + } else { + sync_compute(start, size); + sync_compute(0, end); + } + } + if (ResetUsed) { + last_sync_compute_streams_ = end; + } + } + { + unsigned int size = static_cast(transfer_streams_.size()); + if (size > 0) { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int start = last_sync_transfer_streams_; + unsigned int end = num_transfer_streams_ < size + ? num_transfer_streams_ + : transfer_stream_idx_.load(); + if (end - start >= size) { + sync_transfer(0, size); + } else { + start %= size; + end %= size; + if (start < end) { + sync_transfer(start, end); + } else { + sync_transfer(start, size); + sync_transfer(0, end); + } + } + if (ResetUsed) { + last_sync_transfer_streams_ = end; + } + } + } + } + + ur_context_handle_t get_context() const { return context_; }; + + ur_device_handle_t get_device() const { return device_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + uint32_t get_next_event_id() noexcept { return ++eventCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index cb90edcd81c0b..5e274dd518da6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -66,14 +66,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( if 
(UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetProfilingInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetCallback = nullptr; - pDdiTable->pfnWait = nullptr; + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = urEventSetCallback; + pDdiTable->pfnWait = urEventWait; return UR_RESULT_SUCCESS; } @@ -209,14 +209,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnFinish = nullptr; - pDdiTable->pfnFlush = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; return UR_RESULT_SUCCESS; } From 917faef73a138d80015e4c8af5ca31b5b8effeef Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 18 May 2023 12:59:57 +0100 Subject: [PATCH 08/42] [SYCL][HIP][UR] change the fixup for info queries with bool return type --- sycl/plugins/unified_runtime/pi2ur.hpp | 724 ++----------------------- sycl/plugins/unified_runtime/ur/ur.hpp | 9 + 2 files changed, 66 insertions(+), 
667 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 432baa3224f31..f5ad4494e929d 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -217,6 +217,40 @@ inline pi_result ur2piPlatformInfoValue(ur_platform_info_t ParamName, (int)ParamValueSizePI, (int)*ParamValueSizeUR); die("ur2piPlatformInfoValue: size mismatch"); } + + return PI_SUCCESS; +} + +// Handle mismatched PI and UR type return sizes for info queries +inline pi_result fixupInfoValueTypes(size_t ParamValueSizeRetUR, + size_t *ParamValueSizeRetPI, + size_t ParamValueSize, void *ParamValue) { + if (ParamValueSizeRetUR == 1 && ParamValueSize == 4) { + // extend bool to pi_bool (uint32_t) + if (ParamValue) { + auto *ValIn = static_cast(ParamValue); + auto *ValOut = static_cast(ParamValue); + *ValOut = static_cast(*ValIn); + } + if (ParamValueSizeRetPI) { + *ParamValueSizeRetPI = sizeof(pi_bool); + } + } + + return PI_SUCCESS; +} + +template +inline pi_result +ConvertInputBitfield(pi_bitfield in, TypeOut *out, + const std::unordered_map &map) { + *out = 0; + for (auto &[FlagPI, FlagUR] : map) { + if (in & FlagPI) { + *out |= FlagUR; + } + } + return PI_SUCCESS; } @@ -599,13 +633,7 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, ParamValue, &UrParamValueSizeRet)); - if (ParamValueSizeRet) { - *ParamValueSizeRet = UrParamValueSizeRet; - } - ur2piPlatformInfoValue(UrParamName, ParamValueSize, &ParamValueSize, - ParamValue); - fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, ParamValueSize, - ParamValue); + ur2piPlatformInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); return PI_SUCCESS; } @@ -1021,17 +1049,14 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_ERROR_UNKNOWN; }; - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - size_t UrParamValueSizeRet; - auto 
UrDevice = reinterpret_cast(Device); - - HANDLE_ERRORS(urDeviceGetInfo(UrDevice, InfoType, ParamValueSize, ParamValue, + auto hDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceGetInfo(hDevice, InfoType, ParamValueSize, ParamValue, &UrParamValueSizeRet)); - if (ParamValueSizeRet) { *ParamValueSizeRet = UrParamValueSizeRet; } + ur2piDeviceInfoValue(InfoType, ParamValueSize, &ParamValueSize, ParamValue); fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, ParamValueSize, ParamValue); @@ -1225,7 +1250,17 @@ inline pi_result piextContextSetExtendedDeleter( pi_context Context, pi_context_extended_deleter Function, void *UserData) { auto hContext = reinterpret_cast(Context); - HANDLE_ERRORS(urContextSetExtendedDeleter(hContext, Function, UserData)); + size_t UrParamValueSizeRet; + auto hContext = reinterpret_cast(context); + HANDLE_ERRORS(urContextGetInfo(hContext, InfoType->second, param_value_size, + param_value, &UrParamValueSizeRet)); + + if (param_value_size_ret) { + *param_value_size_ret = UrParamValueSizeRet; + } + + fixupInfoValueTypes(UrParamValueSizeRet, param_value_size_ret, + param_value_size, param_value); return PI_SUCCESS; } @@ -2854,660 +2889,15 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, PI_ERROR_INVALID_VALUE); } - ur_context_handle_t UrContext = - reinterpret_cast(Context); - auto UrDevice = reinterpret_cast(Device); - - ur_usm_desc_t USMDesc{}; - ur_usm_device_desc_t UsmDeviceDesc{}; - UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; - ur_usm_host_desc_t UsmHostDesc{}; - UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; - if (Properties) { - if (Properties[0] == PI_MEM_ALLOC_FLAGS) { - if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { - UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_WRITE_COMBINED; - } - if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { - UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_INITIAL_PLACEMENT; - } - if (Properties[1] == 
PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { - UsmHostDesc.flags |= UR_USM_HOST_MEM_FLAG_INITIAL_PLACEMENT; - } - if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { - UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - } - } - } - UsmDeviceDesc.pNext = &UsmHostDesc; - USMDesc.pNext = &UsmDeviceDesc; - - USMDesc.align = Alignment; - - ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS( - urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); - - return PI_SUCCESS; -} - -inline pi_result piextUSMFree(pi_context Context, void *Ptr) { - ur_context_handle_t UrContext = - reinterpret_cast(Context); - HANDLE_ERRORS(urUSMFree(UrContext, Ptr)); - return PI_SUCCESS; -} - -inline pi_result piMemRetain(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - ur_mem_handle_t UrMem = reinterpret_cast(Mem); - - HANDLE_ERRORS(urMemRetain(UrMem)); - - return PI_SUCCESS; -} - -inline pi_result piMemRelease(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - ur_mem_handle_t UrMem = reinterpret_cast(Mem); - - HANDLE_ERRORS(urMemRelease(UrMem)); - - return PI_SUCCESS; -} - -/// Hint to migrate memory to the device -/// -/// @param Queue is the queue to submit to -/// @param Ptr points to the memory to migrate -/// @param Size is the number of bytes to migrate -/// @param Flags is a bitfield used to specify memory migration options -/// @param NumEventsInWaitList is the number of events to wait on -/// @param EventsWaitList is an array of events to wait on -/// @param Event is the event that represents this operation -inline pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, - size_t Size, - pi_usm_migration_flags Flags, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - // flags is currently unused so fail if set - PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - - const 
ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - // TODO: to map from pi_usm_migration_flags to - // ur_usm_migration_flags_t - // once we have those defined - ur_usm_migration_flags_t UrFlags{}; - HANDLE_ERRORS(urEnqueueUSMPrefetch(UrQueue, Ptr, Size, UrFlags, - NumEventsInWaitList, UrEventsWaitList, - UrEvent)); - - return PI_SUCCESS; -} - -/// USM memadvise API to govern behavior of automatic migration mechanisms -/// -/// @param Queue is the queue to submit to -/// @param Ptr is the data to be advised -/// @param Length is the size in bytes of the meory to advise -/// @param Advice is device specific advice -/// @param Event is the event that represents this operation -/// -inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, - size_t Length, pi_mem_advice Advice, - pi_event *OutEvent) { - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - ur_usm_advice_flags_t UrAdvice{}; - if (Advice & PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY) { - UrAdvice |= UR_USM_ADVICE_FLAG_SET_READ_MOSTLY; - } - if (Advice & PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY) { - UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY; - } - if (Advice & PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION) { - UrAdvice |= UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION; - } - if (Advice & PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION) { - UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION; - } - if (Advice & PI_MEM_ADVICE_RESET) { - UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT; - } - - HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); - - return PI_SUCCESS; -} - -/// USM 2D Fill API -/// -/// \param queue is the queue to submit to -/// \param ptr is the ptr to fill -/// \param pitch is the total width of the destination memory including padding -/// \param pattern is a pointer with 
the bytes of the pattern to set -/// \param pattern_size is the size in bytes of the pattern -/// \param width is width in bytes of each row to fill -/// \param height is height the columns to fill -/// \param num_events_in_waitlist is the number of events to wait on -/// \param events_waitlist is an array of events to wait on -/// \param event is the event that represents this operation -inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, - size_t PatternSize, const void *Pattern, - size_t Width, size_t Height, - pi_uint32 NumEventsWaitList, - const pi_event *EventsWaitList, - pi_event *Event) { - - auto hQueue = reinterpret_cast(Queue); - auto phEventWaitList = - reinterpret_cast(EventsWaitList); - auto phEvent = reinterpret_cast(Event); - - HANDLE_ERRORS(urEnqueueUSMFill2D(hQueue, Ptr, Pitch, PatternSize, Pattern, - Width, Height, NumEventsWaitList, - phEventWaitList, phEvent)); - - return PI_SUCCESS; -} - -inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, - size_t Pitch, int Value, size_t Width, - size_t Height, - pi_uint32 NumEventsWaitList, - const pi_event *EventsWaitList, - pi_event *Event) { - std::ignore = Queue; - std::ignore = Ptr; - std::ignore = Pitch; - std::ignore = Value; - std::ignore = Width; - std::ignore = Height; - std::ignore = NumEventsWaitList; - std::ignore = EventsWaitList; - std::ignore = Event; - die("piextUSMEnqueueMemset2D: not implemented"); - return PI_SUCCESS; -} - -inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, - pi_mem_alloc_info ParamName, - size_t ParamValueSize, - void *ParamValue, - size_t *ParamValueSizeRet) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - ur_context_handle_t UrContext = - reinterpret_cast(Context); - - ur_usm_alloc_info_t UrParamName{}; - switch (ParamName) { - case PI_MEM_ALLOC_TYPE: { - UrParamName = UR_USM_ALLOC_INFO_TYPE; - break; - } - case PI_MEM_ALLOC_BASE_PTR: { - UrParamName = UR_USM_ALLOC_INFO_BASE_PTR; - break; - 
} - case PI_MEM_ALLOC_SIZE: { - UrParamName = UR_USM_ALLOC_INFO_SIZE; - break; - } - case PI_MEM_ALLOC_DEVICE: { - UrParamName = UR_USM_ALLOC_INFO_DEVICE; - break; - } - default: { - die("piextUSMGetMemAllocInfo: unsuppported ParamName."); - } - } - - size_t SizeInOut = ParamValueSize; - HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, - ParamValueSize, ParamValue, - ParamValueSizeRet)) - ur2piUSMAllocInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); - return PI_SUCCESS; -} - -inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet) { - - auto hMem = reinterpret_cast(Image); - - ur_image_info_t UrParamName{}; - switch (ParamName) { - case PI_IMAGE_INFO_FORMAT: { - UrParamName = UR_IMAGE_INFO_FORMAT; - break; - } - case PI_IMAGE_INFO_ELEMENT_SIZE: { - UrParamName = UR_IMAGE_INFO_ELEMENT_SIZE; - break; - } - case PI_IMAGE_INFO_ROW_PITCH: { - UrParamName = UR_IMAGE_INFO_ROW_PITCH; - break; - } - case PI_IMAGE_INFO_SLICE_PITCH: { - UrParamName = UR_IMAGE_INFO_SLICE_PITCH; - break; - } - case PI_IMAGE_INFO_WIDTH: { - UrParamName = UR_IMAGE_INFO_WIDTH; - break; - } - case PI_IMAGE_INFO_HEIGHT: { - UrParamName = UR_IMAGE_INFO_HEIGHT; - break; - } - case PI_IMAGE_INFO_DEPTH: { - UrParamName = UR_IMAGE_INFO_DEPTH; - break; - } - default: - return PI_ERROR_UNKNOWN; - } - - HANDLE_ERRORS(urMemImageGetInfo(hMem, UrParamName, ParamValueSize, ParamValue, - ParamValueSizeRet)); - return PI_SUCCESS; -} - -/// USM 2D Memcpy API -/// -/// \param queue is the queue to submit to -/// \param blocking is whether this operation should block the host -/// \param dst_ptr is the location the data will be copied -/// \param dst_pitch is the total width of the destination memory including -/// padding -/// \param src_ptr is the data to be copied -/// \param dst_pitch is the total width of the source memory including padding -/// \param width is width in bytes of each row to be 
copied -/// \param height is height the columns to be copied -/// \param num_events_in_waitlist is the number of events to wait on -/// \param events_waitlist is an array of events to wait on -/// \param event is the event that represents this operation -inline pi_result piextUSMEnqueueMemcpy2D(pi_queue Queue, pi_bool Blocking, - void *DstPtr, size_t DstPitch, - const void *SrcPtr, size_t SrcPitch, - size_t Width, size_t Height, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - if (!DstPtr || !SrcPtr) - return PI_ERROR_INVALID_VALUE; - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueUSMMemcpy2D( - UrQueue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, - NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -// Memory -/////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////// -// Enqueue - -inline pi_result -piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, - const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *OutEvent) { - - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueKernelLaunch( - UrQueue, UrKernel, WorkDim, 
GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result -piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, - pi_image_offset Origin, pi_image_region Region, - size_t InputRowPitch, size_t InputSlicePitch, - const void *Ptr, pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *OutEvent) { - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrImage = reinterpret_cast(Image); - ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; - ur_rect_region_t UrRegion{}; - UrRegion.depth = Region->depth; - UrRegion.height = Region->height; - UrRegion.width = Region->width; - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemImageWrite( - UrQueue, UrImage, BlockingWrite, UrOrigin, UrRegion, InputRowPitch, - InputSlicePitch, const_cast(Ptr), NumEventsInWaitList, - UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result -piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, - pi_image_offset Origin, pi_image_region Region, - size_t RowPitch, size_t SlicePitch, void *Ptr, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrImage = reinterpret_cast(Image); - ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; - ur_rect_region_t UrRegion{}; - UrRegion.depth = Region->depth; - UrRegion.height = Region->height; - UrRegion.width = Region->width; - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemImageRead( - UrQueue, UrImage, 
BlockingRead, UrOrigin, UrRegion, RowPitch, SlicePitch, - Ptr, NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferMap( - pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, - size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *OutEvent, void **RetMap) { - // TODO: we don't implement read-only or write-only, always read-write. - // assert((map_flags & PI_MAP_READ) != 0); - // assert((map_flags & PI_MAP_WRITE) != 0); - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrMem = reinterpret_cast(Mem); - - ur_map_flags_t UrMapFlags{}; - if (MapFlags & PI_MAP_READ) - UrMapFlags |= UR_MAP_FLAG_READ; - if (MapFlags & PI_MAP_WRITE) - UrMapFlags |= UR_MAP_FLAG_WRITE; - if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) - UrMapFlags |= UR_MAP_FLAG_WRITE_INVALIDATE_REGION; - - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferMap(UrQueue, UrMem, BlockingMap, UrMapFlags, - Offset, Size, NumEventsInWaitList, - UrEventsWaitList, UrEvent, RetMap)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrMem = reinterpret_cast(Mem); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemUnmap(UrQueue, UrMem, MappedPtr, - NumEventsInWaitList, UrEventsWaitList, - UrEvent)); - - return PI_SUCCESS; 
-} - -inline pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, - const void *Pattern, size_t PatternSize, - size_t Offset, size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, Pattern, PatternSize, - Offset, Size, NumEventsInWaitList, - UrEventsWaitList, UrEvent)); - return PI_SUCCESS; -} - -inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, - pi_int32 Value, size_t Count, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - if (!Ptr) { - return PI_ERROR_INVALID_VALUE; + size_t UrParamValueSizeRet; + auto hSampler = reinterpret_cast(Sampler); + HANDLE_ERRORS(urSamplerGetInfo(hSampler, InfoType->second, ParamValueSize, + ParamValue, &UrParamValueSizeRet)); + if (ParamValueSizeRet) { + *ParamValueSizeRet = UrParamValueSizeRet; } - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - size_t PatternSize = 1; - HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, &Value, Count, - NumEventsInWaitList, UrEventsWaitList, - UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferCopyRect( - pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, - pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, - size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, - size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, 
- const pi_event *EventsWaitList, pi_event *OutEvent) { - - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); - ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); - ur_rect_offset_t UrSrcOrigin{SrcOrigin->x_bytes, SrcOrigin->y_scalar, - SrcOrigin->z_scalar}; - ur_rect_offset_t UrDstOrigin{DstOrigin->x_bytes, DstOrigin->y_scalar, - DstOrigin->z_scalar}; - ur_rect_region_t UrRegion{}; - UrRegion.depth = Region->depth_scalar; - UrRegion.height = Region->height_scalar; - UrRegion.width = Region->width_bytes; - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferCopyRect( - UrQueue, UrBufferSrc, UrBufferDst, UrSrcOrigin, UrDstOrigin, UrRegion, - SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, - NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, - pi_mem DstMem, size_t SrcOffset, - size_t DstOffset, size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); - ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferCopy( - UrQueue, UrBufferSrc, UrBufferDst, SrcOffset, DstOffset, Size, - NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, - 
void *DstPtr, const void *SrcPtr, - size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueUSMMemcpy(UrQueue, Blocking, DstPtr, SrcPtr, Size, - NumEventsInWaitList, UrEventsWaitList, - UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferWriteRect( - pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, - pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, - pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, - size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, - pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, - pi_event *OutEvent) { - - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); - ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, - BufferOffset->z_scalar}; - ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, - HostOffset->z_scalar}; - ur_rect_region_t UrRegion{}; - UrRegion.depth = Region->depth_scalar; - UrRegion.height = Region->height_scalar; - UrRegion.width = Region->width_bytes; - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferWriteRect( - UrQueue, UrBuffer, BlockingWrite, UrBufferOffset, UrHostOffset, UrRegion, - BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, - const_cast(Ptr), NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, - pi_bool BlockingWrite, 
size_t Offset, - size_t Size, const void *Ptr, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { - - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); - const ur_event_handle_t *UrEventsWaitList = - reinterpret_cast(EventsWaitList); - - ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - - HANDLE_ERRORS(urEnqueueMemBufferWrite( - UrQueue, UrBuffer, BlockingWrite, Offset, Size, const_cast(Ptr), - NumEventsInWaitList, UrEventsWaitList, UrEvent)); - - return PI_SUCCESS; -} - -inline pi_result piEnqueueMemBufferReadRect( - pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, - pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, - pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, - size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, - pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, - pi_event *OutEvent) { - - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, ParamValueSize, + ParamValue); ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 6978c6f2838a6..5777ddf0da99e 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -309,3 +309,12 @@ class UrReturnHelper { void *param_value; size_t *param_value_size_ret; }; + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void 
setErrorMessage(const char *message, + ur_result_t error_code); From ea26e248ffc5c643c15ce3dc6fe96227020def7d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 18 May 2023 15:47:40 +0100 Subject: [PATCH 09/42] [SYCL][HIP][UR][PI] Port remaining device entry-points --- sycl/plugins/hip/pi_hip.cpp | 95 +------------------ .../ur/adapters/hip/device.cpp | 65 +++++++++++++ .../ur/adapters/hip/program.cpp | 25 +++++ .../ur/adapters/hip/ur_interface_loader.cpp | 6 +- 4 files changed, 96 insertions(+), 95 deletions(-) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 746cc82a73b1b..0977ec7210ac1 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -403,64 +403,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -/// \return If available, the first binary that is PTX -/// -pi_result hip_piextDeviceSelectBinary(pi_device device, - pi_device_binary *binaries, - pi_uint32 num_binaries, - pi_uint32 *selected_binary) { - (void)device; - if (!binaries) { - sycl::detail::pi::die("No list of device images provided"); - } - if (num_binaries < 1) { - sycl::detail::pi::die("No binary images in the list"); - } - - // Look for an image for the HIP target, and return the first one that is - // found -#if defined(__HIP_PLATFORM_AMD__) - const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN; -#elif defined(__HIP_PLATFORM_NVIDIA__) - const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif - - for (pi_uint32 i = 0; i < num_binaries; i++) { - if (strcmp(binaries[i]->DeviceTargetSpec, binary_type) == 0) { - *selected_binary = i; - return PI_SUCCESS; - } - } - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; -} - -pi_result hip_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, - pi_program program, - const char 
*func_name, - pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); - assert(func_pointer_ret != nullptr); - - hipFunction_t func; - hipError_t ret = hipModuleGetFunction(&func, program->get(), func_name); - *func_pointer_ret = reinterpret_cast(func); - pi_result retError = PI_SUCCESS; - - if (ret != hipSuccess && ret != hipErrorNotFound) - retError = PI_CHECK_ERROR(ret); - if (ret == hipErrorNotFound) { - *func_pointer_ret = 0; - retError = PI_ERROR_INVALID_KERNEL_NAME; - } - - return retError; -} - pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, void *ptr, @@ -2014,37 +1956,6 @@ pi_result hip_piextEnqueueWriteHostPipe( return {}; } -pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, - uint64_t *HostTime) { - if (!DeviceTime && !HostTime) - return PI_SUCCESS; - - _pi_event::native_type event; - - ScopedContext active(Device->get_context()); - - if (DeviceTime) { - PI_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault)); - PI_CHECK_ERROR(hipEventRecord(event)); - } - if (HostTime) { - using namespace std::chrono; - *HostTime = - duration_cast(steady_clock::now().time_since_epoch()) - .count(); - } - - if (DeviceTime) { - PI_CHECK_ERROR(hipEventSynchronize(event)); - - float elapsedTime = 0.0f; - PI_CHECK_ERROR(hipEventElapsedTime(&elapsedTime, - ur_platform_handle_t_::evBase_, event)); - *DeviceTime = (uint64_t)(elapsedTime * (double)1e6); - } - return PI_SUCCESS; -} - const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -2075,8 +1986,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piDevicePartition, pi2ur::piDevicePartition) _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) - _PI_CL(piextDeviceSelectBinary, 
hip_piextDeviceSelectBinary) - _PI_CL(piextGetDeviceFunctionPointer, hip_piextGetDeviceFunctionPointer) + _PI_CL(piextDeviceSelectBinary, pi2ur::piextDeviceSelectBinary) + _PI_CL(piextGetDeviceFunctionPointer, pi2ur::piextGetDeviceFunctionPointer) _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, pi2ur::piextDeviceCreateWithNativeHandle) @@ -2202,7 +2113,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, hip_piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) - _PI_CL(piGetDeviceAndHostTimer, hip_piGetDeviceAndHostTimer) + _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, hip_piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index a537fe58328d5..032f113bd0d78 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -8,6 +8,7 @@ #include "device.hpp" #include "context.hpp" +#include "event.hpp" #include @@ -927,3 +928,67 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return UR_RESULT_ERROR_INVALID_OPERATION; } + +/// \return If available, the first binary that is PTX +/// +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); + + // Look for an image for the HIP target, and return the first one that is + // found +#if defined(__HIP_PLATFORM_AMD__) + const char *binary_type = UR_DEVICE_BINARY_TARGET_AMDGCN; +#elif defined(__HIP_PLATFORM_NVIDIA__) + const 
char *binary_type = UR_DEVICE_BINARY_TARGET_NVPTX64; +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + for (uint32_t i = 0; i < NumBinaries; i++) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, binary_type) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} + +ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, + uint64_t *pDeviceTimestamp, + uint64_t *pHostTimestamp) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (!pDeviceTimestamp && !pHostTimestamp) + return UR_RESULT_SUCCESS; + + ur_event_handle_t_::native_type event; + ScopedContext active(hDevice->get_context()); + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault)); + UR_CHECK_ERROR(hipEventRecord(event)); + } + if (pHostTimestamp) { + using namespace std::chrono; + *pHostTimestamp = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); + } + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(hipEventSynchronize(event)); + float elapsedTime = 0.0f; + UR_CHECK_ERROR(hipEventElapsedTime(&elapsedTime, + ur_platform_handle_t_::evBase_, event)); + *pDeviceTimestamp = (uint64_t)(elapsedTime * (double)1e6); + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 9420c30982975..3f6cbd9eb223c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -299,3 +299,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t hDevice, 
ur_program_handle_t hProgram, + const char *pFunctionName, void **ppFunctionPointer) { + // Check if device passed is the same the device bound to the context + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hipFunction_t func; + hipError_t ret = hipModuleGetFunction(&func, hProgram->get(), pFunctionName); + *ppFunctionPointer = func; + ur_result_t retError = UR_RESULT_SUCCESS; + + if (ret != hipSuccess && ret != hipErrorNotFound) + retError = UR_CHECK_ERROR(ret); + if (ret == hipErrorNotFound) { + *ppFunctionPointer = 0; + retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + } + + return retError; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 5e274dd518da6..2fd5ffc9d0cf7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -89,7 +89,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; - pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; pDdiTable->pfnGetInfo = urProgramGetInfo; pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; pDdiTable->pfnLink = urProgramLink; @@ -245,13 +245,13 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( } pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; pDdiTable->pfnGet = urDeviceGet; - pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetGlobalTimestamps 
= urDeviceGetGlobalTimestamps; pDdiTable->pfnGetInfo = urDeviceGetInfo; pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; pDdiTable->pfnPartition = urDevicePartition; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnRetain = urDeviceRetain; - pDdiTable->pfnSelectBinary = nullptr; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; return UR_RESULT_SUCCESS; } From ae20a7013a736172742180577bddee7366e448a8 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 18 May 2023 15:50:53 +0100 Subject: [PATCH 10/42] [SYCL][HIP][UR] Port kernelSetArgMem/Sampler entry points --- sycl/plugins/hip/pi_hip.cpp | 63 +------------------ .../ur/adapters/hip/kernel.cpp | 53 ++++++++++++++++ .../ur/adapters/hip/ur_interface_loader.cpp | 4 +- 3 files changed, 57 insertions(+), 63 deletions(-) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 0977ec7210ac1..acf8f47d864af 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -494,65 +494,6 @@ pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, return retErr; } -pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, - const pi_mem *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - // Below sets kernel arg when zero-sized buffers are handled. - // In such case the corresponding memory is null. 
- if (*arg_value == nullptr) { - kernel->set_kernel_arg(arg_index, 0, nullptr); - return PI_SUCCESS; - } - - pi_result retErr = PI_SUCCESS; - try { - pi_mem arg_mem = *arg_value; - - if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { - auto array = arg_mem->mem_.surface_mem_.get_array(); - hipArray_Format Format; - size_t NumChannels; - getArrayDesc(array, Format, NumChannels); - if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && - Format != HIP_AD_FORMAT_SIGNED_INT32 && - Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) { - sycl::detail::pi::die( - "PI HIP kernels only support images with channel types int32, " - "uint32, float, and half."); - } - hipSurfaceObject_t hipSurf = arg_mem->mem_.surface_mem_.get_surface(); - kernel->set_kernel_arg(arg_index, sizeof(hipSurf), (void *)&hipSurf); - } else - - { - void *hipPtr = arg_mem->mem_.buffer_mem_.get_void(); - kernel->set_kernel_arg(arg_index, sizeof(void *), (void *)&hipPtr); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, - const pi_sampler *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - pi_result retErr = PI_SUCCESS; - try { - pi_uint32 samplerProps = (*arg_value)->props_; - kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - pi_result hip_piEnqueueKernelLaunch( pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -2109,8 +2050,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, hip_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, hip_piextEnqueueWriteHostPipe) - _PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj) - _PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler) + _PI_CL(piextKernelSetArgMemObj, 
pi2ur::piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, hip_piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 40dba4a782d75..4c69ad3a37962 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -7,6 +7,8 @@ //===-----------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" +#include "sampler.hpp" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -278,6 +280,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { + + UR_ASSERT(hKernel != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hArgValue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + auto array = hArgValue->mem_.surface_mem_.get_array(); + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); + if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && + Format != HIP_AD_FORMAT_SIGNED_INT32 && + Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) { + sycl::detail::ur::die( + "UR HIP kernels only support images with channel types int32, " + "uint32, float, and half."); + } + hipSurfaceObject_t hipSurf = hArgValue->mem_.surface_mem_.get_surface(); + hKernel->set_kernel_arg(argIndex, sizeof(hipSurf), (void *)&hipSurf); + } else + + { + void *hipPtr = hArgValue->mem_.buffer_mem_.get_void(); + 
hKernel->set_kernel_arg(argIndex, sizeof(void *), (void *)&hipPtr); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + ur_sampler_handle_t hArgValue) { + + UR_ASSERT(hKernel != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hArgValue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + uint32_t samplerProps = hArgValue->props_; + hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + // A NOP for the HIP backend UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 2fd5ffc9d0cf7..9d103052fddb0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -115,9 +115,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnRelease = urKernelRelease; pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; - pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; From 379100e81d7a3639973a81c31ecd2776038cb10e Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 19 May 2023 12:58:25 +0100 Subject: [PATCH 11/42] [SYCL][HIP][UR] Rebase pi2ur to cuda branch and remove ur_object --- 
sycl/plugins/unified_runtime/pi2ur.hpp | 724 ++++++++++++++++-- .../ur/adapters/hip/kernel.hpp | 2 +- .../ur/adapters/hip/program.hpp | 2 +- .../ur/adapters/hip/sampler.hpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 13 - 5 files changed, 670 insertions(+), 73 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index f5ad4494e929d..432baa3224f31 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -217,40 +217,6 @@ inline pi_result ur2piPlatformInfoValue(ur_platform_info_t ParamName, (int)ParamValueSizePI, (int)*ParamValueSizeUR); die("ur2piPlatformInfoValue: size mismatch"); } - - return PI_SUCCESS; -} - -// Handle mismatched PI and UR type return sizes for info queries -inline pi_result fixupInfoValueTypes(size_t ParamValueSizeRetUR, - size_t *ParamValueSizeRetPI, - size_t ParamValueSize, void *ParamValue) { - if (ParamValueSizeRetUR == 1 && ParamValueSize == 4) { - // extend bool to pi_bool (uint32_t) - if (ParamValue) { - auto *ValIn = static_cast(ParamValue); - auto *ValOut = static_cast(ParamValue); - *ValOut = static_cast(*ValIn); - } - if (ParamValueSizeRetPI) { - *ParamValueSizeRetPI = sizeof(pi_bool); - } - } - - return PI_SUCCESS; -} - -template -inline pi_result -ConvertInputBitfield(pi_bitfield in, TypeOut *out, - const std::unordered_map &map) { - *out = 0; - for (auto &[FlagPI, FlagUR] : map) { - if (in & FlagPI) { - *out |= FlagUR; - } - } - return PI_SUCCESS; } @@ -633,7 +599,13 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, ParamValue, &UrParamValueSizeRet)); - ur2piPlatformInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); + if (ParamValueSizeRet) { + *ParamValueSizeRet = UrParamValueSizeRet; + } + ur2piPlatformInfoValue(UrParamName, ParamValueSize, &ParamValueSize, + ParamValue); + fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, 
ParamValueSize, + ParamValue); return PI_SUCCESS; } @@ -1049,14 +1021,17 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_ERROR_UNKNOWN; }; + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + size_t UrParamValueSizeRet; - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceGetInfo(hDevice, InfoType, ParamValueSize, ParamValue, + auto UrDevice = reinterpret_cast(Device); + + HANDLE_ERRORS(urDeviceGetInfo(UrDevice, InfoType, ParamValueSize, ParamValue, &UrParamValueSizeRet)); + if (ParamValueSizeRet) { *ParamValueSizeRet = UrParamValueSizeRet; } - ur2piDeviceInfoValue(InfoType, ParamValueSize, &ParamValueSize, ParamValue); fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, ParamValueSize, ParamValue); @@ -1250,17 +1225,7 @@ inline pi_result piextContextSetExtendedDeleter( pi_context Context, pi_context_extended_deleter Function, void *UserData) { auto hContext = reinterpret_cast(Context); - size_t UrParamValueSizeRet; - auto hContext = reinterpret_cast(context); - HANDLE_ERRORS(urContextGetInfo(hContext, InfoType->second, param_value_size, - param_value, &UrParamValueSizeRet)); - - if (param_value_size_ret) { - *param_value_size_ret = UrParamValueSizeRet; - } - - fixupInfoValueTypes(UrParamValueSizeRet, param_value_size_ret, - param_value_size, param_value); + HANDLE_ERRORS(urContextSetExtendedDeleter(hContext, Function, UserData)); return PI_SUCCESS; } @@ -2889,15 +2854,660 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, PI_ERROR_INVALID_VALUE); } - size_t UrParamValueSizeRet; - auto hSampler = reinterpret_cast(Sampler); - HANDLE_ERRORS(urSamplerGetInfo(hSampler, InfoType->second, ParamValueSize, - ParamValue, &UrParamValueSizeRet)); - if (ParamValueSizeRet) { - *ParamValueSizeRet = UrParamValueSizeRet; + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_usm_desc_t USMDesc{}; + ur_usm_device_desc_t UsmDeviceDesc{}; + 
UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; + ur_usm_host_desc_t UsmHostDesc{}; + UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; + if (Properties) { + if (Properties[0] == PI_MEM_ALLOC_FLAGS) { + if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_WRITE_COMBINED; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_INITIAL_PLACEMENT; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { + UsmHostDesc.flags |= UR_USM_HOST_MEM_FLAG_INITIAL_PLACEMENT; + } + if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + } + } } - fixupInfoValueTypes(UrParamValueSizeRet, ParamValueSizeRet, ParamValueSize, - ParamValue); + UsmDeviceDesc.pNext = &UsmHostDesc; + USMDesc.pNext = &UsmDeviceDesc; + + USMDesc.align = Alignment; + + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS( + urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMFree(pi_context Context, void *Ptr) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + HANDLE_ERRORS(urUSMFree(UrContext, Ptr)); + return PI_SUCCESS; +} + +inline pi_result piMemRetain(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRetain(UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piMemRelease(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRelease(UrMem)); + + return PI_SUCCESS; +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitList is the 
number of events to wait on +/// @param EventsWaitList is an array of events to wait on +/// @param Event is the event that represents this operation +inline pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, + size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + // flags is currently unused so fail if set + PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + // TODO: to map from pi_usm_migration_flags to + // ur_usm_migration_flags_t + // once we have those defined + ur_usm_migration_flags_t UrFlags{}; + HANDLE_ERRORS(urEnqueueUSMPrefetch(UrQueue, Ptr, Size, UrFlags, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the data to be advised +/// @param Length is the size in bytes of the meory to advise +/// @param Advice is device specific advice +/// @param Event is the event that represents this operation +/// +inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, + size_t Length, pi_mem_advice Advice, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + ur_usm_advice_flags_t UrAdvice{}; + if (Advice & PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_SET_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION) { + UrAdvice |= 
UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_RESET) { + UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT; + } + + HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); + + return PI_SUCCESS; +} + +/// USM 2D Fill API +/// +/// \param queue is the queue to submit to +/// \param ptr is the ptr to fill +/// \param pitch is the total width of the destination memory including padding +/// \param pattern is a pointer with the bytes of the pattern to set +/// \param pattern_size is the size in bytes of the pattern +/// \param width is width in bytes of each row to fill +/// \param height is height the columns to fill +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, + size_t PatternSize, const void *Pattern, + size_t Width, size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + + auto hQueue = reinterpret_cast(Queue); + auto phEventWaitList = + reinterpret_cast(EventsWaitList); + auto phEvent = reinterpret_cast(Event); + + HANDLE_ERRORS(urEnqueueUSMFill2D(hQueue, Ptr, Pitch, PatternSize, Pattern, + Width, Height, NumEventsWaitList, + phEventWaitList, phEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, + size_t Pitch, int Value, size_t Width, + size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = Pitch; + std::ignore = Value; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + 
die("piextUSMEnqueueMemset2D: not implemented"); + return PI_SUCCESS; +} + +inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, + pi_mem_alloc_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_usm_alloc_info_t UrParamName{}; + switch (ParamName) { + case PI_MEM_ALLOC_TYPE: { + UrParamName = UR_USM_ALLOC_INFO_TYPE; + break; + } + case PI_MEM_ALLOC_BASE_PTR: { + UrParamName = UR_USM_ALLOC_INFO_BASE_PTR; + break; + } + case PI_MEM_ALLOC_SIZE: { + UrParamName = UR_USM_ALLOC_INFO_SIZE; + break; + } + case PI_MEM_ALLOC_DEVICE: { + UrParamName = UR_USM_ALLOC_INFO_DEVICE; + break; + } + default: { + die("piextUSMGetMemAllocInfo: unsuppported ParamName."); + } + } + + size_t SizeInOut = ParamValueSize; + HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet)) + ur2piUSMAllocInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); + return PI_SUCCESS; +} + +inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + auto hMem = reinterpret_cast(Image); + + ur_image_info_t UrParamName{}; + switch (ParamName) { + case PI_IMAGE_INFO_FORMAT: { + UrParamName = UR_IMAGE_INFO_FORMAT; + break; + } + case PI_IMAGE_INFO_ELEMENT_SIZE: { + UrParamName = UR_IMAGE_INFO_ELEMENT_SIZE; + break; + } + case PI_IMAGE_INFO_ROW_PITCH: { + UrParamName = UR_IMAGE_INFO_ROW_PITCH; + break; + } + case PI_IMAGE_INFO_SLICE_PITCH: { + UrParamName = UR_IMAGE_INFO_SLICE_PITCH; + break; + } + case PI_IMAGE_INFO_WIDTH: { + UrParamName = UR_IMAGE_INFO_WIDTH; + break; + } + case PI_IMAGE_INFO_HEIGHT: { + UrParamName = UR_IMAGE_INFO_HEIGHT; + break; + } + case PI_IMAGE_INFO_DEPTH: { + UrParamName = UR_IMAGE_INFO_DEPTH; + break; + } + default: + return PI_ERROR_UNKNOWN; 
+ } + + HANDLE_ERRORS(urMemImageGetInfo(hMem, UrParamName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + return PI_SUCCESS; +} + +/// USM 2D Memcpy API +/// +/// \param queue is the queue to submit to +/// \param blocking is whether this operation should block the host +/// \param dst_ptr is the location the data will be copied +/// \param dst_pitch is the total width of the destination memory including +/// padding +/// \param src_ptr is the data to be copied +/// \param dst_pitch is the total width of the source memory including padding +/// \param width is width in bytes of each row to be copied +/// \param height is height the columns to be copied +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueMemcpy2D(pi_queue Queue, pi_bool Blocking, + void *DstPtr, size_t DstPitch, + const void *SrcPtr, size_t SrcPitch, + size_t Width, size_t Height, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + if (!DstPtr || !SrcPtr) + return PI_ERROR_INVALID_VALUE; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy2D( + UrQueue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Memory +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Enqueue + +inline pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t 
*LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueKernelLaunch( + UrQueue, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, + pi_image_offset Origin, pi_image_region Region, + size_t InputRowPitch, size_t InputSlicePitch, + const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageWrite( + UrQueue, UrImage, BlockingWrite, UrOrigin, UrRegion, InputRowPitch, + InputSlicePitch, const_cast(Ptr), NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, + pi_image_offset Origin, pi_image_region Region, + size_t RowPitch, size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event 
*EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageRead( + UrQueue, UrImage, BlockingRead, UrOrigin, UrRegion, RowPitch, SlicePitch, + Ptr, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferMap( + pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent, void **RetMap) { + // TODO: we don't implement read-only or write-only, always read-write. 
+ // assert((map_flags & PI_MAP_READ) != 0); + // assert((map_flags & PI_MAP_WRITE) != 0); + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + ur_map_flags_t UrMapFlags{}; + if (MapFlags & PI_MAP_READ) + UrMapFlags |= UR_MAP_FLAG_READ; + if (MapFlags & PI_MAP_WRITE) + UrMapFlags |= UR_MAP_FLAG_WRITE; + if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) + UrMapFlags |= UR_MAP_FLAG_WRITE_INVALIDATE_REGION; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferMap(UrQueue, UrMem, BlockingMap, UrMapFlags, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent, RetMap)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemUnmap(UrQueue, UrMem, MappedPtr, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, + const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const 
ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, + pi_int32 Value, size_t Count, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + if (!Ptr) { + return PI_ERROR_INVALID_VALUE; + } + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + size_t PatternSize = 1; + HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, &Value, Count, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x_bytes, SrcOrigin->y_scalar, + SrcOrigin->z_scalar}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x_bytes, DstOrigin->y_scalar, + DstOrigin->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t 
*UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopyRect( + UrQueue, UrBufferSrc, UrBufferDst, UrSrcOrigin, UrDstOrigin, UrRegion, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, + pi_mem DstMem, size_t SrcOffset, + size_t DstOffset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopy( + UrQueue, UrBufferSrc, UrBufferDst, SrcOffset, DstOffset, Size, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, + void *DstPtr, const void *SrcPtr, + size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy(UrQueue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t 
BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWriteRect( + UrQueue, UrBuffer, BlockingWrite, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + const_cast(Ptr), NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, + pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWrite( + UrQueue, UrBuffer, BlockingWrite, Offset, Size, const_cast(Ptr), + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result 
piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp index 2fdc79da2de29..d2fb5f6be288c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp @@ -15,7 +15,7 @@ #include "program.hpp" -struct ur_kernel_handle_t_ : _ur_object { +struct ur_kernel_handle_t_ { using native_type = hipFunction_t; native_type function_; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp index d84f888c755d0..aa1f3a4657df7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp @@ -13,7 +13,7 @@ #include "context.hpp" -struct ur_program_handle_t_ : _ur_object { +struct ur_program_handle_t_ { using native_type = hipModule_t; native_type module_; const char *binary_; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp index 3d0a3059e61fa..6b60092292ed2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp @@ -15,7 +15,7 @@ /// Sampler property layout: /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | -struct ur_sampler_handle_t_ : _ur_object { +struct ur_sampler_handle_t_ { std::atomic_uint32_t refCount_; uint32_t props_; ur_context_handle_t context_; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 5777ddf0da99e..2099b31529176 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -269,10 +269,6 @@ getInfo(size_t param_value_size, void *param_value, } } // namespace ur -// FIXME: This class will cause failures in the UR CTS tests as it is used in UR -// getInfo entry-points, this should be okay for now to make sycl-rt works -// correctly with the existing PI layer. But, it should be deleted once the PI -// layer is completely ported to UR and deleted. class UrReturnHelper { public: UrReturnHelper(size_t param_value_size, void *param_value, @@ -309,12 +305,3 @@ class UrReturnHelper { void *param_value; size_t *param_value_size_ret; }; - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -extern thread_local ur_result_t ErrorMessageCode; -extern thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); From 2870280d43af204407365205236b8cef020d221d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 19 May 2023 11:48:55 +0100 Subject: [PATCH 12/42] [SYCL][HIP][UR] Port urPlatformGetBackendOption and urGetLastResult --- sycl/plugins/hip/pi_hip.cpp | 28 ++----------------- .../ur/adapters/hip/platform.cpp | 20 +++++++++++++ .../ur/adapters/hip/ur_interface_loader.cpp | 3 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index acf8f47d864af..8a1fee2c464f3 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ 
-163,30 +163,6 @@ thread_local char ErrorMessage[MaxMessageSize]; ErrorMessageCode = error_code; } -// Returns plugin specific error and warning messages -pi_result hip_piPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} - -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return empty string for hip. -// TODO: Determine correct string to be passed. -pi_result hip_piPluginGetBackendOption(pi_platform, const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) - return PI_ERROR_INVALID_VALUE; - if (frontend_option == "-O0"sv || frontend_option == "-O1"sv || - frontend_option == "-O2"sv || frontend_option == "-O3"sv || - frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - /// Converts HIP error into PI error codes, and outputs error information /// to stderr. /// If PI_HIP_ABORT env variable is defined, it aborts directly instead of @@ -2052,10 +2028,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) - _PI_CL(piPluginGetLastError, hip_piPluginGetLastError) + _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) - _PI_CL(piPluginGetBackendOption, hip_piPluginGetBackendOption) + _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp index 1cc2c098e4a62..8cd9bda305cb4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -139,3 +139,23 @@ UR_DLLEXPORT ur_result_t UR_APICALL 
urInit(ur_device_init_flags_t) { UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { return UR_RESULT_SUCCESS; } + +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return empty string for hip. +// TODO: Determine correct string to be passed. +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t hPlatform, const char *pFrontendOption, + const char **ppPlatformOption) { + (void)hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 9d103052fddb0..89ab252979488 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -41,6 +41,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; pDdiTable->pfnGetInfo = urPlatformGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; return UR_RESULT_SUCCESS; } @@ -197,7 +198,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnGetLastResult = urGetLastResult; pDdiTable->pfnInit = urInit; pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; From 8a101a18eab49b88a36f4f75c77154db64550440 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Thu, 18 May 2023 18:33:20 +0100 Subject: [PATCH 13/42] [SYCL][PI][HIP][UR] 
Port enqueue entry points --- sycl/plugins/hip/CMakeLists.txt | 1 + sycl/plugins/hip/pi_hip.cpp | 1784 +---------------- sycl/plugins/hip/pi_hip.hpp | 59 - sycl/plugins/unified_runtime/CMakeLists.txt | 1 + .../ur/adapters/hip/enqueue.cpp | 1532 ++++++++++++++ 5 files changed, 1563 insertions(+), 1814 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index f90d510ecb7f5..bd354b10ca91f 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -98,6 +98,7 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/context.hpp" "../unified_runtime/ur/adapters/hip/device.cpp" "../unified_runtime/ur/adapters/hip/device.hpp" + "../unified_runtime/ur/adapters/hip/enqueue.cpp" "../unified_runtime/ur/adapters/hip/event.cpp" "../unified_runtime/ur/adapters/hip/event.hpp" "../unified_runtime/ur/adapters/hip/platform.cpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 8a1fee2c464f3..3873c7910c5b4 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -47,109 +47,6 @@ pi_result map_error(hipError_t result) { } } -// TODO(ur) - this can be removed once more of pi entry points are ported to UR. 
-pi_result map_ur_error(ur_result_t result) { - - switch (result) { -#define CASE(UR_ERR, PI_ERR) \ - case UR_ERR: \ - return PI_ERR; - - CASE(UR_RESULT_SUCCESS, PI_SUCCESS) - CASE(UR_RESULT_ERROR_INVALID_OPERATION, PI_ERROR_INVALID_OPERATION) - CASE(UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES, - PI_ERROR_INVALID_QUEUE_PROPERTIES) - CASE(UR_RESULT_ERROR_INVALID_QUEUE, PI_ERROR_INVALID_QUEUE) - CASE(UR_RESULT_ERROR_INVALID_VALUE, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_INVALID_CONTEXT, PI_ERROR_INVALID_CONTEXT) - CASE(UR_RESULT_ERROR_INVALID_PLATFORM, PI_ERROR_INVALID_PLATFORM) - CASE(UR_RESULT_ERROR_INVALID_BINARY, PI_ERROR_INVALID_BINARY) - CASE(UR_RESULT_ERROR_INVALID_PROGRAM, PI_ERROR_INVALID_BINARY) - CASE(UR_RESULT_ERROR_INVALID_SAMPLER, PI_ERROR_INVALID_SAMPLER) - CASE(UR_RESULT_ERROR_INVALID_BUFFER_SIZE, PI_ERROR_INVALID_BUFFER_SIZE) - CASE(UR_RESULT_ERROR_INVALID_MEM_OBJECT, PI_ERROR_INVALID_MEM_OBJECT) - CASE(UR_RESULT_ERROR_INVALID_EVENT, PI_ERROR_INVALID_EVENT) - CASE(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, - PI_ERROR_INVALID_EVENT_WAIT_LIST) - CASE(UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET, - PI_ERROR_MISALIGNED_SUB_BUFFER_OFFSET) - CASE(UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE, - PI_ERROR_INVALID_WORK_GROUP_SIZE) - CASE(UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE, - PI_ERROR_COMPILER_NOT_AVAILABLE) - CASE(UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE, - PI_ERROR_PROFILING_INFO_NOT_AVAILABLE) - CASE(UR_RESULT_ERROR_DEVICE_NOT_FOUND, PI_ERROR_DEVICE_NOT_FOUND) - CASE(UR_RESULT_ERROR_INVALID_DEVICE, PI_ERROR_INVALID_DEVICE) - CASE(UR_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_AVAILABLE) - // UR_RESULT_ERROR_DEVICE_REQUIRES_RESET - // UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE - CASE(UR_RESULT_ERROR_DEVICE_PARTITION_FAILED, - PI_ERROR_DEVICE_PARTITION_FAILED) - CASE(UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT, - PI_ERROR_INVALID_DEVICE_PARTITION_COUNT) - CASE(UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE, - PI_ERROR_INVALID_WORK_ITEM_SIZE) - 
CASE(UR_RESULT_ERROR_INVALID_WORK_DIMENSION, - PI_ERROR_INVALID_WORK_DIMENSION) - CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGS, PI_ERROR_INVALID_KERNEL_ARGS) - CASE(UR_RESULT_ERROR_INVALID_KERNEL, PI_ERROR_INVALID_KERNEL) - CASE(UR_RESULT_ERROR_INVALID_KERNEL_NAME, PI_ERROR_INVALID_KERNEL_NAME) - CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX, - PI_ERROR_INVALID_ARG_INDEX) - CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE, - PI_ERROR_INVALID_ARG_SIZE) - // UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE - CASE(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, PI_ERROR_INVALID_IMAGE_SIZE) - CASE(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, - PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR) - CASE(UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED, - PI_ERROR_IMAGE_FORMAT_NOT_SUPPORTED) - CASE(UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE, - PI_ERROR_MEM_OBJECT_ALLOCATION_FAILURE) - CASE(UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE, - PI_ERROR_INVALID_PROGRAM_EXECUTABLE) - CASE(UR_RESULT_ERROR_UNINITIALIZED, PI_ERROR_UNINITIALIZED) - CASE(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY, PI_ERROR_OUT_OF_HOST_MEMORY) - CASE(UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, PI_ERROR_OUT_OF_RESOURCES) - CASE(UR_RESULT_ERROR_OUT_OF_RESOURCES, PI_ERROR_OUT_OF_RESOURCES) - CASE(UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE, PI_ERROR_BUILD_PROGRAM_FAILURE) - CASE(UR_RESULT_ERROR_PROGRAM_LINK_FAILURE, PI_ERROR_LINK_PROGRAM_FAILURE) - // UR_RESULT_ERROR_UNSUPPORTED_VERSION - // UR_RESULT_ERROR_UNSUPPORTED_FEATURE - CASE(UR_RESULT_ERROR_INVALID_ARGUMENT, PI_ERROR_INVALID_ARG_VALUE) - CASE(UR_RESULT_ERROR_INVALID_NULL_HANDLE, PI_ERROR_INVALID_VALUE) - // UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE - CASE(UR_RESULT_ERROR_INVALID_NULL_POINTER, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_INVALID_SIZE, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_UNSUPPORTED_SIZE, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, PI_ERROR_INVALID_VALUE) - 
CASE(UR_RESULT_ERROR_INVALID_ENUMERATION, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT, - PI_ERROR_IMAGE_FORMAT_MISMATCH) - CASE(UR_RESULT_ERROR_INVALID_NATIVE_BINARY, PI_ERROR_INVALID_BINARY) - CASE(UR_RESULT_ERROR_INVALID_GLOBAL_NAME, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_ERROR_INVALID_VALUE) - CASE(UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION, - PI_ERROR_INVALID_WORK_GROUP_SIZE) - CASE(UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION, - PI_ERROR_INVALID_GLOBAL_WORK_SIZE) - // UR_RESULT_ERROR_PROGRAM_UNLINKED - // UR_RESULT_ERROR_OVERLAPPING_REGIONS - CASE(UR_RESULT_ERROR_INVALID_HOST_PTR, PI_ERROR_INVALID_HOST_PTR) - // UR_RESULT_ERROR_INVALID_USM_SIZE - CASE(UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE, - PI_ERROR_MEM_OBJECT_ALLOCATION_FAILURE) - CASE(UR_RESULT_ERROR_ADAPTER_SPECIFIC, PI_ERROR_PLUGIN_SPECIFIC_ERROR) - -#undef CASE - default: - return PI_ERROR_UNKNOWN; - } -} - // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR constexpr size_t MaxMessageSize = 256; thread_local pi_result ErrorMessageCode = PI_SUCCESS; @@ -202,119 +99,6 @@ pi_result check_error(hipError_t result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -/// \cond NODOXY -template -pi_result getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, size_t value_size, - Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return PI_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return PI_SUCCESS; -} - -template -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t 
value_size) { - (void)value_size; - *static_cast(param_value) = value; - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), std::move(assignment)); -} - -template -pi_result getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - T *value) { - - auto assignment = [](void *param_value, T *value, size_t value_size) { - memcpy(param_value, static_cast(value), value_size); - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), std::move(assignment)); -} - -template <> -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, - const char *value) { - return getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - -/// \endcond - -void simpleGuessLocalWorkSize(size_t *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], - [[maybe_unused]] pi_kernel kernel) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - // int recommendedBlockSize, minGrid; - - // PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( - // &minGrid, &recommendedBlockSize, kernel->get(), - // 0, 0)); - - //(void)minGrid; // Not used, avoid warnings - - threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]); - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. 
- while (0u != (global_work_size[0] % threadsPerBlock[0])) { - --threadsPerBlock[0]; - } -} - -pi_result enqueueEventsWait(pi_queue command_queue, hipStream_t stream, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list) { - if (!event_wait_list) { - return PI_SUCCESS; - } - try { - ScopedContext active(command_queue->get_context()); - - auto result = map_ur_error(forLatestEvents( - reinterpret_cast(event_wait_list), - num_events_in_wait_list, - [stream](ur_event_handle_t event) -> ur_result_t { - if (event->get_stream() == stream) { - return UR_RESULT_SUCCESS; - } else { - return UR_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0)); - } - })); - - if (result != PI_SUCCESS) { - return result; - } - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -347,22 +131,6 @@ void assertion(bool Condition, const char *Message) { } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl -//-------------- -// PI object implementation - -extern "C" { - -// Required in a number of functions, so forward declare here -pi_result hip_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -} // extern "C" - /// \endcond // makes all future work submitted to queue wait for all work captured in event. 
@@ -379,1500 +147,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_write, size_t offset, - size_t size, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, hipStream)); - retImplEv->start(); - } - - retErr = PI_CHECK_ERROR( - hipMemcpyHtoDAsync(buffer->mem_.buffer_mem_.get_with_offset(offset), - ptr, size, hipStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - - if (event) { - *event = retImplEv.release(); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_read, size_t offset, - size_t size, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - if (event) { - retImplEv = 
std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, hipStream)); - retImplEv->start(); - } - - retErr = PI_CHECK_ERROR(hipMemcpyDtoHAsync( - ptr, buffer->mem_.buffer_mem_.get_with_offset(offset), size, - hipStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piEnqueueKernelLaunch( - pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - // Preconditions - assert(command_queue != nullptr); - assert(command_queue->get_context() == kernel->get_context()); - assert(kernel != nullptr); - assert(global_work_offset != nullptr); - assert(work_dim > 0); - assert(work_dim < 4); - - if (*global_work_size == 0) { - return hip_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (local_work_size != nullptr); - - { - pi_result retError = pi2ur::piDeviceGetInfo( - reinterpret_cast(command_queue->device_), - PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(maxThreadsPerBlock), - maxThreadsPerBlock, nullptr); - assert(retError == PI_SUCCESS); - (void)retError; - - retError = pi2ur::piDeviceGetInfo( - reinterpret_cast(command_queue->device_), - PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), - &maxWorkGroupSize, nullptr); - assert(retError == 
PI_SUCCESS); - // The maxWorkGroupsSize = 1024 for AMD GPU - // The maxThreadsPerBlock = {1024, 1024, 1024} - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than the - // global work sizes and not 0. - if (0u == local_work_size[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = local_work_size[dim]; - return PI_SUCCESS; - }; - - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; - } - } else { - simpleGuessLocalWorkSize(threadsPerBlock, global_work_size, - maxThreadsPerBlock, kernel); - } - } - - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t blocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; - } - - pi_result retError = PI_SUCCESS; - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - - pi_uint32 stream_token; - _pi_stream_guard guard; - hipStream_t hipStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - hipFunction_t hipFunc = kernel->get(); - - retError = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - // Set the implicit global offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { - std::uint32_t hip_implicit_offset[3] = {0, 0, 0}; - if (global_work_offset) { - for (size_t i = 0; 
i < work_dim; i++) { - hip_implicit_offset[i] = - static_cast(global_work_offset[i]); - if (global_work_offset[i] != 0) { - hipFunc = kernel->get_with_offset_parameter(); - } - } - } - kernel->set_implicit_offset_arg(sizeof(hip_implicit_offset), - hip_implicit_offset); - } - - auto argIndices = kernel->get_arg_indices(); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue, - hipStream, stream_token)); - retImplEv->start(); - } - - // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = - std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE"); - - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; - retError = PI_CHECK_ERROR(hipDeviceGetAttribute( - &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock, - command_queue->get_device()->get())); - - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - retError = PI_CHECK_ERROR(hipFuncSetAttribute( - hipFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val)); - } - - retError = PI_CHECK_ERROR(hipModuleLaunchKernel( - hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], - kernel->get_local_size(), hipStream, argIndices.data(), nullptr)); - - kernel->clear_local_size(); - - if (event) { - retError = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result -hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, - size_t cb_args, pi_uint32 num_mem_objects, - const pi_mem *mem_list, const void **args_mem_loc, - pi_uint32 num_events_in_wait_list, - const pi_event 
*event_wait_list, pi_event *event) { - (void)queue; - (void)user_func; - (void)args; - (void)cb_args; - (void)num_mem_objects; - (void)mem_list; - (void)args_mem_loc; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - - sycl::detail::pi::die("Not implemented in HIP backend"); - return {}; -} - -/// Enqueues a wait on the given queue for all events. -/// See \ref enqueueEventWait -/// -/// Currently queues are represented by a single in-order stream, therefore -/// every command is an implicit barrier and so hip_piEnqueueEventsWait has the -/// same behavior as hip_piEnqueueEventsWaitWithBarrier. So -/// hip_piEnqueueEventsWait can just call hip_piEnqueueEventsWaitWithBarrier. -pi_result hip_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - return hip_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); -} - -/// Enqueues a wait on the given queue for all specified events. -/// See \ref enqueueEventWaitWithBarrier -/// -/// If the events list is empty, the enqueued wait will wait on all previous -/// events in the queue. 
-pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - pi_result result; - - try { - ScopedContext active(command_queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - hipStream_t hipStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - { - std::lock_guard guard(command_queue->barrier_mutex_); - if (command_queue->barrier_event_ == nullptr) { - PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_event_)); - } - if (num_events_in_wait_list == 0) { // wait on all work - if (command_queue->barrier_tmp_event_ == nullptr) { - PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_tmp_event_)); - } - command_queue->sync_streams( - [hipStream, - tmp_event = command_queue->barrier_tmp_event_](hipStream_t s) { - if (hipStream != s) { - PI_CHECK_ERROR(hipEventRecord(tmp_event, s)); - PI_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0)); - } - }); - } else { // wait just on given events - forLatestEvents( - reinterpret_cast(event_wait_list), - num_events_in_wait_list, - [hipStream](ur_event_handle_t event) -> ur_result_t { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { - return UR_RESULT_SUCCESS; - } else { - return UR_CHECK_ERROR( - hipStreamWaitEvent(hipStream, event->get(), 0)); - } - }); - } - - result = PI_CHECK_ERROR( - hipEventRecord(command_queue->barrier_event_, hipStream)); - for (unsigned int i = 0; - i < command_queue->compute_applied_barrier_.size(); i++) { - command_queue->compute_applied_barrier_[i] = false; - } - for (unsigned int i = 0; - i < command_queue->transfer_applied_barrier_.size(); i++) { - command_queue->transfer_applied_barrier_[i] = false; - } - } - if (result != PI_SUCCESS) { - return result; - } - - if (event) { - 
*event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, - hipStream, stream_token); - (*event)->start(); - (*event)->record(); - } - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -/// General 3D memory copy operation. -/// This function requires the corresponding HIP context to be at the top of -/// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr -/// must be a pointer to a hipDevPtr -static pi_result commonEnqueueMemBufferCopyRect( - hipStream_t hip_stream, pi_buff_rect_region region, const void *src_ptr, - const hipMemoryType src_type, pi_buff_rect_offset src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const hipMemoryType dst_type, pi_buff_rect_offset dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - assert(region != nullptr); - assert(src_offset != nullptr); - assert(dst_offset != nullptr); - - assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost); - assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost); - - src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch; - src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch) - : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch; - dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch) - : dst_slice_pitch; - - HIP_MEMCPY3D params; - - params.WidthInBytes = region->width_bytes; - params.Height = region->height_scalar; - params.Depth = region->depth_scalar; - - params.srcMemoryType = src_type; - params.srcDevice = src_type == hipMemoryTypeDevice - ? *static_cast(src_ptr) - : 0; - params.srcHost = src_type == hipMemoryTypeHost ? 
src_ptr : nullptr; - params.srcXInBytes = src_offset->x_bytes; - params.srcY = src_offset->y_scalar; - params.srcZ = src_offset->z_scalar; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == hipMemoryTypeDevice - ? *reinterpret_cast(dst_ptr) - : 0; - params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr; - params.dstXInBytes = dst_offset->x_bytes; - params.dstY = dst_offset->y_scalar; - params.dstZ = dst_offset->z_scalar; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream)); - - return PI_SUCCESS; -} - -pi_result hip_piEnqueueMemBufferReadRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - void *devPtr = buffer->mem_.buffer_mem_.get_void(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, hipStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, &devPtr, hipMemoryTypeDevice, buffer_offset, - buffer_row_pitch, buffer_slice_pitch, ptr, hipMemoryTypeHost, - host_offset, host_row_pitch, host_slice_pitch); - - if (event) { - 
retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piEnqueueMemBufferWriteRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - void *devPtr = buffer->mem_.buffer_mem_.get_void(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, hipStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, ptr, hipMemoryTypeHost, host_offset, host_row_pitch, - host_slice_pitch, &devPtr, hipMemoryTypeDevice, buffer_offset, - buffer_row_pitch, buffer_slice_pitch); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, - pi_mem dst_buffer, size_t src_offset, - size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, - 
const pi_event *event_wait_list, - pi_event *event) { - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - pi_result result; - auto stream = command_queue->get_next_transfer_stream(); - - if (event_wait_list) { - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - } - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto src = src_buffer->mem_.buffer_mem_.get_with_offset(src_offset); - auto dst = dst_buffer->mem_.buffer_mem_.get_with_offset(dst_offset); - - result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } -} - -pi_result hip_piEnqueueMemBufferCopyRect( - pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, - pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, - size_t dst_row_pitch, size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - void *srcPtr = src_buffer->mem_.buffer_mem_.get_void(); - void *dstPtr = dst_buffer->mem_.buffer_mem_.get_void(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, hipStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, &srcPtr, hipMemoryTypeDevice, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, hipMemoryTypeDevice, - dst_origin, dst_row_pitch, dst_slice_pitch); - - if (event) { - retImplEv->record(); - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result hip_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, - const void *pattern, size_t pattern_size, - size_t offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(command_queue != nullptr); - - auto args_are_multiples_of_pattern_size = - (offset % pattern_size == 0) || (size % pattern_size == 0); - - auto pattern_is_valid = (pattern != nullptr); - - auto pattern_size_is_valid = - ((pattern_size & 
(pattern_size - 1)) == 0) && // is power of two - (pattern_size > 0) && (pattern_size <= 128); // falls within valid range - - assert(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid); - (void)args_are_multiples_of_pattern_size; - (void)pattern_is_valid; - (void)pattern_size_is_valid; - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - - auto stream = command_queue->get_next_transfer_stream(); - pi_result result; - if (event_wait_list) { - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - } - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset); - auto N = size / pattern_size; - - // pattern size in bytes - switch (pattern_size) { - case 1: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream)); - break; - } - case 2: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream)); - break; - } - case 4: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); - break; - } - - default: { - // HIP has no memset functions that allow setting values more than 4 - // bytes. PI API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 1 byte values, and set the buffer using multiple strided calls. - // The first 4 patterns are set using hipMemsetD32Async then all - // subsequent 1 byte patterns are set using hipMemset2DAsync which is - // called for each pattern. 
- - // Calculate the number of patterns, stride, number of times the pattern - // needs to be applied, and the number of times the first 32 bit pattern - // needs to be applied. - auto number_of_steps = pattern_size / sizeof(uint8_t); - auto pitch = number_of_steps * sizeof(uint8_t); - auto height = size / number_of_steps; - auto count_32 = size / sizeof(uint32_t); - - // Get 4-byte chunk of the pattern and call hipMemsetD32Async - auto value = *(static_cast(pattern)); - result = - PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream)); - for (auto step = 4u; step < number_of_steps; ++step) { - // take 1 byte of the pattern - value = *(static_cast(pattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = reinterpret_cast( - reinterpret_cast(dstDevice) + (step * sizeof(uint8_t))); - - // set all of the pattern chunks - result = PI_CHECK_ERROR(hipMemset2DAsync( - offset_ptr, pitch, value, sizeof(uint8_t), height, stream)); - } - break; - } - } - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -static size_t imageElementByteSize(hipArray_Format array_format) { - switch (array_format) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - case HIP_AD_FORMAT_SIGNED_INT8: - return 1; - case HIP_AD_FORMAT_UNSIGNED_INT16: - case HIP_AD_FORMAT_SIGNED_INT16: - case HIP_AD_FORMAT_HALF: - return 2; - case HIP_AD_FORMAT_UNSIGNED_INT32: - case HIP_AD_FORMAT_SIGNED_INT32: - case HIP_AD_FORMAT_FLOAT: - return 4; - default: - return 0; - } - sycl::detail::pi::die("Invalid iamge format."); - return 0; -} - -/// General ND memory copy operation for images (where N > 1). 
-/// This function requires the corresponding HIP context to be at the top of -/// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr -/// must be a pointer to a hipArray - -static pi_result commonEnqueueMemImageNDCopy( - hipStream_t hip_stream, pi_mem_type img_type, const size_t *region, - const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset, - void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) { - assert(region != nullptr); - - assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost); - assert(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost); - - if (img_type == PI_MEM_TYPE_IMAGE2D) { - hip_Memcpy2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = - reinterpret_cast(const_cast(src_ptr)); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == hipMemoryTypeArray) { - cpyDesc.dstArray = - reinterpret_cast(const_cast(dst_ptr)); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream)); - } - - if (img_type == PI_MEM_TYPE_IMAGE3D) { - - HIP_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = - reinterpret_cast(const_cast(src_ptr)); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - cpyDesc.srcZ = src_offset[2]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == hipMemoryTypeArray) { - cpyDesc.dstArray = reinterpret_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = 
dst_offset[1]; - cpyDesc.dstZ = dst_offset[2]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - cpyDesc.Depth = region[2]; - return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream)); - return PI_ERROR_UNKNOWN; - } - - return PI_ERROR_INVALID_VALUE; -} - -// TODO(ur) - this is just a workaround until we port Enqueue -static std::unordered_map UrToPiMemTypeMap = { - {UR_MEM_TYPE_BUFFER, PI_MEM_TYPE_BUFFER}, - {UR_MEM_TYPE_IMAGE2D, PI_MEM_TYPE_IMAGE2D}, - {UR_MEM_TYPE_IMAGE3D, PI_MEM_TYPE_IMAGE3D}, - {UR_MEM_TYPE_IMAGE2D_ARRAY, PI_MEM_TYPE_IMAGE2D_ARRAY}, - {UR_MEM_TYPE_IMAGE1D, PI_MEM_TYPE_IMAGE1D}, - {UR_MEM_TYPE_IMAGE1D_ARRAY, PI_MEM_TYPE_IMAGE1D_ARRAY}, - {UR_MEM_TYPE_IMAGE1D_BUFFER, PI_MEM_TYPE_IMAGE1D_BUFFER}, -}; - -pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, - pi_bool blocking_read, const size_t *origin, - const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - (void)row_pitch; - (void)slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - - if (event_wait_list) { - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - } - - hipArray *array = image->mem_.surface_mem_.get_array(); - - hipArray_Format Format; - size_t NumChannels; - getArrayDesc(array, Format, NumChannels); - - int elementByteSize = imageElementByteSize(Format); - - size_t byteOffsetX = origin[0] * elementByteSize * NumChannels; - size_t bytesToCopy = elementByteSize * NumChannels * region[0]; - - // TODO(ur) - this can be removed when porting Enqueue - auto urImgType = 
image->mem_.surface_mem_.get_image_type(); - pi_mem_type imgType; - if (auto search = UrToPiMemTypeMap.find(urImgType); - search != UrToPiMemTypeMap.end()) { - imgType = search->second; - } else { - return PI_ERROR_UNKNOWN; - } - - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, - array, hipMemoryTypeArray, srcOffset, - ptr, hipMemoryTypeHost, nullptr); - - if (retErr != PI_SUCCESS) { - return retErr; - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, - command_queue, hipStream); - new_event->record(); - *event = new_event; - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; - return retErr; -} - -pi_result hip_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, - pi_bool blocking_write, - const size_t *origin, const size_t *region, - size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - (void)blocking_write; - (void)input_row_pitch; - (void)input_slice_pitch; - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - - if (event_wait_list) { - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - } - - hipArray *array = image->mem_.surface_mem_.get_array(); - - hipArray_Format Format; - size_t NumChannels; - getArrayDesc(array, Format, NumChannels); - - int elementByteSize = imageElementByteSize(Format); - - size_t byteOffsetX = 
origin[0] * elementByteSize * NumChannels; - size_t bytesToCopy = elementByteSize * NumChannels * region[0]; - - // TODO(ur) - this can be removed when porting Enqueue - auto urImgType = image->mem_.surface_mem_.get_image_type(); - pi_mem_type imgType; - if (auto search = UrToPiMemTypeMap.find(urImgType); - search != UrToPiMemTypeMap.end()) { - imgType = search->second; - } else { - return PI_ERROR_UNKNOWN; - } - - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, - ptr, hipMemoryTypeHost, nullptr, array, - hipMemoryTypeArray, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, - command_queue, hipStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; - - return retErr; -} - -pi_result hip_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, - pi_mem dst_image, const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(src_image->mem_type_ == _pi_mem::mem_type::surface); - assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); - assert(src_image->mem_.surface_mem_.get_image_type() == - dst_image->mem_.surface_mem_.get_image_type()); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - hipStream_t hipStream = command_queue->get_next_transfer_stream(); - if (event_wait_list) { - retErr = enqueueEventsWait(command_queue, hipStream, - num_events_in_wait_list, event_wait_list); - } - - hipArray *srcArray = src_image->mem_.surface_mem_.get_array(); - hipArray_Format srcFormat; - size_t srcNumChannels; - 
getArrayDesc(srcArray, srcFormat, srcNumChannels); - - hipArray *dstArray = dst_image->mem_.surface_mem_.get_array(); - hipArray_Format dstFormat; - size_t dstNumChannels; - getArrayDesc(dstArray, dstFormat, dstNumChannels); - - assert(srcFormat == dstFormat); - assert(srcNumChannels == dstNumChannels); - - int elementByteSize = imageElementByteSize(srcFormat); - - size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcNumChannels; - size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels; - size_t bytesToCopy = elementByteSize * srcNumChannels * region[0]; - - // TODO(ur) - this can be removed when porting Enqueue - auto urImgType = src_image->mem_.surface_mem_.get_image_type(); - pi_mem_type imgType; - if (auto search = UrToPiMemTypeMap.find(urImgType); - search != UrToPiMemTypeMap.end()) { - imgType = search->second; - } else { - return PI_ERROR_UNKNOWN; - } - - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray, - srcOffset, dstArray, hipMemoryTypeArray, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, - command_queue, hipStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; - return retErr; -} - -/// \TODO Not implemented in HIP. 
-pi_result hip_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, - const void *fill_color, - const size_t *origin, const size_t *region, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - (void)command_queue; - (void)image; - (void)fill_color; - (void)origin; - (void)region; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - - sycl::detail::pi::die("hip_piEnqueueMemImageFill not implemented"); - return {}; -} - -/// Implements mapping on the host using a BufferRead operation. -/// Mapped pointers are stored in the pi_mem object. -/// If the buffer uses pinned host memory a pointer to that memory is returned -/// and no read operation is done. -/// -pi_result hip_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_map, - pi_map_flags map_flags, size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event, void **ret_map) { - assert(ret_map != nullptr); - assert(command_queue != nullptr); - assert(buffer != nullptr); - assert(buffer->mem_type_ == _pi_mem::mem_type::buffer); - - pi_result ret_err = PI_ERROR_INVALID_OPERATION; - const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - // Currently no support for overlapping regions - if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; - } - - // Allocate a pointer in the host to store the mapped information - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags); - *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = PI_SUCCESS; - } - - if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. 
- ret_err = hip_piEnqueueMemBufferRead( - command_queue, buffer, blocking_map, offset, size, hostPtr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - return ret_err; -} - -/// Implements the unmap from the host, using a BufferWrite operation. -/// Requires the mapped pointer to be already registered in the given memobj. -/// If memobj uses pinned host memory, this will not do a write. -/// -pi_result hip_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, - void *mapped_ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - pi_result ret_err = PI_SUCCESS; - - assert(command_queue != nullptr); - assert(mapped_ptr != nullptr); - assert(memobj != nullptr); - assert(memobj->mem_type_ == _pi_mem::mem_type::buffer); - assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr); - assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr); - - const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - if (!is_pinned && - ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) || - (memobj->mem_.buffer_mem_.get_map_flags() & - PI_MAP_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. 
- ret_err = hip_piEnqueueMemBufferWrite( - command_queue, memobj, true, - memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), - memobj->mem_.buffer_mem_.get_size(), mapped_ptr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - memobj->mem_.buffer_mem_.unmap(mapped_ptr); - return ret_err; -} - -pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, - size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - hipStream_t hipStream = queue->get_next_compute_stream( - num_events_in_waitlist, - reinterpret_cast(events_waitlist), guard, - &stream_token); - result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, hipStream, stream_token)); - event_ptr->start(); - } - result = PI_CHECK_ERROR( - hipMemsetD8Async(reinterpret_cast(ptr), - (unsigned char)value & 0xFF, count, hipStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - - return result; -} - -pi_result hip_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, - void *dst_ptr, const void 
*src_ptr, - size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(dst_ptr != nullptr); - assert(src_ptr != nullptr); - pi_result result = PI_SUCCESS; - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - hipStream_t hipStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR( - hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - } - if (blocking) { - result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - if (event) { - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result hip_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, - size_t size, pi_usm_migration_flags flags, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - - // flags is currently unused so fail if set - if (flags != 0) - return PI_ERROR_INVALID_VALUE; - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - hipStream_t hipStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(hipMemPrefetchAsync( - ptr, size, queue->get_context()->get_device()->get(), hipStream)); - if (event) { - result = 
map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - - return result; -} - -/// USM: memadvise API to govern behavior of automatic migration mechanisms -pi_result hip_piextUSMEnqueueMemAdvise(pi_queue queue, - [[maybe_unused]] const void *ptr, - size_t length, pi_mem_advice advice, - pi_event *event) { - (void)length; - (void)advice; - - assert(queue != nullptr); - assert(ptr != nullptr); - // TODO implement a mapping to hipMemAdvise once the expected behaviour - // of piextUSMEnqueueMemAdvise is detailed in the USM extension - return hip_piEnqueueEventsWait(queue, 0, nullptr, event); - - return PI_SUCCESS; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. -pi_result hip_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t, - const void *, size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented"); - return {}; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented. 
-pi_result hip_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t, - size_t, pi_uint32, const pi_event *, - pi_event *) { - sycl::detail::pi::die("hip_piextUSMEnqueueMemset2D: not implemented"); - return {}; -} - -/// 2D Memcpy API -/// -/// \param queue is the queue to submit to -/// \param blocking is whether this operation should block the host -/// \param dst_ptr is the location the data will be copied -/// \param dst_pitch is the total width of the destination memory including -/// padding -/// \param src_ptr is the data to be copied -/// \param dst_pitch is the total width of the source memory including padding -/// \param width is width in bytes of each row to be copied -/// \param height is height the columns to be copied -/// \param num_events_in_waitlist is the number of events to wait on -/// \param events_waitlist is an array of events to wait on -/// \param event is the event that represents this operation -pi_result hip_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, - void *dst_ptr, size_t dst_pitch, - const void *src_ptr, size_t src_pitch, - size_t width, size_t height, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(queue != nullptr); - - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(queue->get_context()); - hipStream_t hipStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, hipStream, num_events_in_wait_list, - event_wait_list); - if (event) { - (*event) = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, - queue, hipStream); - (*event)->start(); - } - - result = PI_CHECK_ERROR(hipMemcpy2DAsync(dst_ptr, dst_pitch, src_ptr, - src_pitch, width, height, - hipMemcpyDefault, hipStream)); - - if (event) { - (*event)->record(); - } - if (blocking) { - result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); - } - } catch (pi_result err) { - result = err; - } - - return result; -} - -pi_result 
hip_piextEnqueueDeviceGlobalVariableWrite( - pi_queue queue, pi_program program, const char *name, - pi_bool blocking_write, size_t count, size_t offset, const void *src, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - (void)queue; - (void)program; - (void)name; - (void)blocking_write; - (void)count; - (void)offset; - (void)src; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - - sycl::detail::pi::die( - "hip_piextEnqueueDeviceGlobalVariableWrite not implemented"); - return {}; -} - -pi_result hip_piextEnqueueDeviceGlobalVariableRead( - pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, - size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - (void)queue; - (void)program; - (void)name; - (void)blocking_read; - (void)count; - (void)offset; - (void)dst; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - - sycl::detail::pi::die( - "hip_piextEnqueueDeviceGlobalVariableRead not implemented"); -} - -/// Host Pipes -pi_result hip_piextEnqueueReadHostPipe(pi_queue queue, pi_program program, - const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("hip_piextEnqueueReadHostPipe not implemented"); - return {}; -} - -pi_result hip_piextEnqueueWriteHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - 
(void)events_waitlist; - (void)event; - - sycl::detail::pi::die("hip_piextEnqueueWriteHostPipe not implemented"); - return {}; -} - const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -1985,46 +259,46 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) _PI_CL(piSamplerRetain, pi2ur::piSamplerRetain) _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) - // Queue commands - _PI_CL(piEnqueueKernelLaunch, hip_piEnqueueKernelLaunch) - _PI_CL(piEnqueueNativeKernel, hip_piEnqueueNativeKernel) - _PI_CL(piEnqueueEventsWait, hip_piEnqueueEventsWait) - _PI_CL(piEnqueueEventsWaitWithBarrier, hip_piEnqueueEventsWaitWithBarrier) - _PI_CL(piEnqueueMemBufferRead, hip_piEnqueueMemBufferRead) - _PI_CL(piEnqueueMemBufferReadRect, hip_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, hip_piEnqueueMemBufferWrite) - _PI_CL(piEnqueueMemBufferWriteRect, hip_piEnqueueMemBufferWriteRect) - _PI_CL(piEnqueueMemBufferCopy, hip_piEnqueueMemBufferCopy) - _PI_CL(piEnqueueMemBufferCopyRect, hip_piEnqueueMemBufferCopyRect) - _PI_CL(piEnqueueMemBufferFill, hip_piEnqueueMemBufferFill) - _PI_CL(piEnqueueMemImageRead, hip_piEnqueueMemImageRead) - _PI_CL(piEnqueueMemImageWrite, hip_piEnqueueMemImageWrite) - _PI_CL(piEnqueueMemImageCopy, hip_piEnqueueMemImageCopy) - _PI_CL(piEnqueueMemImageFill, hip_piEnqueueMemImageFill) - _PI_CL(piEnqueueMemBufferMap, hip_piEnqueueMemBufferMap) - _PI_CL(piEnqueueMemUnmap, hip_piEnqueueMemUnmap) + // Enqueue commands + _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) + _PI_CL(piEnqueueNativeKernel, pi2ur::piEnqueueNativeKernel) + _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) + _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) + _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferReadRect, pi2ur::piEnqueueMemBufferReadRect) + _PI_CL(piEnqueueMemBufferWrite, 
pi2ur::piEnqueueMemBufferWrite) + _PI_CL(piEnqueueMemBufferWriteRect, pi2ur::piEnqueueMemBufferWriteRect) + _PI_CL(piEnqueueMemBufferCopy, pi2ur::piEnqueueMemBufferCopy) + _PI_CL(piEnqueueMemBufferCopyRect, pi2ur::piEnqueueMemBufferCopyRect) + _PI_CL(piEnqueueMemBufferFill, pi2ur::piEnqueueMemBufferFill) + _PI_CL(piEnqueueMemImageRead, pi2ur::piEnqueueMemImageRead) + _PI_CL(piEnqueueMemImageWrite, pi2ur::piEnqueueMemImageWrite) + _PI_CL(piEnqueueMemImageCopy, pi2ur::piEnqueueMemImageCopy) + _PI_CL(piEnqueueMemImageFill, pi2ur::piEnqueueMemImageFill) + _PI_CL(piEnqueueMemBufferMap, pi2ur::piEnqueueMemBufferMap) + _PI_CL(piEnqueueMemUnmap, pi2ur::piEnqueueMemUnmap) // USM _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) _PI_CL(piextUSMSharedAlloc, pi2ur::piextUSMSharedAlloc) _PI_CL(piextUSMFree, pi2ur::piextUSMFree) - _PI_CL(piextUSMEnqueueMemset, hip_piextUSMEnqueueMemset) - _PI_CL(piextUSMEnqueueMemcpy, hip_piextUSMEnqueueMemcpy) - _PI_CL(piextUSMEnqueuePrefetch, hip_piextUSMEnqueuePrefetch) - _PI_CL(piextUSMEnqueueMemAdvise, hip_piextUSMEnqueueMemAdvise) - _PI_CL(piextUSMEnqueueMemcpy2D, hip_piextUSMEnqueueMemcpy2D) - _PI_CL(piextUSMEnqueueFill2D, hip_piextUSMEnqueueFill2D) - _PI_CL(piextUSMEnqueueMemset2D, hip_piextUSMEnqueueMemset2D) + _PI_CL(piextUSMEnqueueMemset, pi2ur::piextUSMEnqueueMemset) + _PI_CL(piextUSMEnqueueMemcpy, pi2ur::piextUSMEnqueueMemcpy) + _PI_CL(piextUSMEnqueuePrefetch, pi2ur::piextUSMEnqueuePrefetch) + _PI_CL(piextUSMEnqueueMemAdvise, pi2ur::piextUSMEnqueueMemAdvise) + _PI_CL(piextUSMEnqueueMemcpy2D, pi2ur::piextUSMEnqueueMemcpy2D) + _PI_CL(piextUSMEnqueueFill2D, pi2ur::piextUSMEnqueueFill2D) + _PI_CL(piextUSMEnqueueMemset2D, pi2ur::piextUSMEnqueueMemset2D) _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, - hip_piextEnqueueDeviceGlobalVariableWrite) + pi2ur::piextEnqueueDeviceGlobalVariableWrite) 
_PI_CL(piextEnqueueDeviceGlobalVariableRead, - hip_piextEnqueueDeviceGlobalVariableRead) + pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe - _PI_CL(piextEnqueueReadHostPipe, hip_piextEnqueueReadHostPipe) - _PI_CL(piextEnqueueWriteHostPipe, hip_piextEnqueueWriteHostPipe) + _PI_CL(piextEnqueueReadHostPipe, pi2ur::piextEnqueueReadHostPipe) + _PI_CL(piextEnqueueWriteHostPipe, pi2ur::piextEnqueueWriteHostPipe) _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index b9b3255f21815..110870bdf42e6 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -130,65 +130,6 @@ typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, struct _pi_event : ur_event_handle_t_ { using ur_event_handle_t_::ur_event_handle_t_; - - // Helpers for queue command implementations until they also get ported to UR - static pi_event - make_native(pi_command_type type, pi_queue queue, hipStream_t stream, - uint32_t stream_token = std::numeric_limits::max()) { - auto urQueue = reinterpret_cast(queue); - static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { - {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, - {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, - UR_COMMAND_MEM_BUFFER_WRITE_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, - {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, - 
{PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, - {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, - {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, - }; - - // TODO(ur): There is no exact mapping for the following commands. Just - // default to KERNEL_LAUNCH for now. - // PI_COMMAND_TYPE_USER - // PI_COMMAND_TYPE_MEM_BUFFER_FILL, - // PI_COMMAND_TYPE_IMAGE_READ, - // PI_COMMAND_TYPE_IMAGE_WRITE, - // PI_COMMAND_TYPE_IMAGE_COPY, - // PI_COMMAND_TYPE_NATIVE_KERNEL, - // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, - // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, - // PI_COMMAND_TYPE_MAP_IMAGE, - // PI_COMMAND_TYPE_MARKER, - // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, - // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, - // PI_COMMAND_TYPE_BARRIER, - // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, - // PI_COMMAND_TYPE_FILL_IMAGE - // PI_COMMAND_TYPE_SVM_FREE - // PI_COMMAND_TYPE_SVM_MEMCPY - // PI_COMMAND_TYPE_SVM_MEMFILL - // PI_COMMAND_TYPE_SVM_MAP - // PI_COMMAND_TYPE_SVM_UNMAP - - ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; - auto cmdIt = cmdMap.find(type); - if (cmdIt != cmdMap.end()) { - urCmd = cmdIt->second; - } - return reinterpret_cast( - ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); - } }; /// Implementation of PI Program on HIP Module object diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c673a249dfb78..f93801906f7ab 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -183,6 +183,7 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/context.hpp" "ur/adapters/hip/device.cpp" "ur/adapters/hip/device.hpp" + "ur/adapters/hip/enqueue.cpp" "ur/adapters/hip/event.cpp" "ur/adapters/hip/event.hpp" "ur/adapters/hip/platform.cpp" diff 
--git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp new file mode 100644 index 0000000000000..a2be12c20c9ab --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -0,0 +1,1532 @@ +//===--------- enqueue.cpp - HIP Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "queue.hpp" + +namespace { + +static size_t imageElementByteSize(hipArray_Format array_format) { + switch (array_format) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + case HIP_AD_FORMAT_SIGNED_INT8: + return 1; + case HIP_AD_FORMAT_UNSIGNED_INT16: + case HIP_AD_FORMAT_SIGNED_INT16: + case HIP_AD_FORMAT_HALF: + return 2; + case HIP_AD_FORMAT_UNSIGNED_INT32: + case HIP_AD_FORMAT_SIGNED_INT32: + case HIP_AD_FORMAT_FLOAT: + return 4; + default: + sycl::detail::ur::die("Invalid image format."); + } + return 0; +} + +ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, + hipStream_t stream, + uint32_t num_events_in_wait_list, + const ur_event_handle_t *event_wait_list) { + if (!event_wait_list) { + return UR_RESULT_SUCCESS; + } + try { + ScopedContext active(command_queue->get_context()); + + auto result = forLatestEvents( + event_wait_list, num_events_in_wait_list, + [stream](ur_event_handle_t event) -> ur_result_t { + if (event->get_stream() == stream) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0)); + } + }); + + if (result != UR_RESULT_SUCCESS) { + return result; + } + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch 
(...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +void simpleGuessLocalWorkSize(size_t *threadsPerBlock, + const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], + ur_kernel_handle_t kernel) { + assert(threadsPerBlock != nullptr); + assert(global_work_size != nullptr); + assert(kernel != nullptr); + // int recommendedBlockSize, minGrid; + + // UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( + // &minGrid, &recommendedBlockSize, kernel->get(), + // 0, 0)); + + //(void)minGrid; // Not used, avoid warnings + + threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]); + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + while (0u != (global_work_size[0] % threadsPerBlock[0])) { + --threadsPerBlock[0]; + } +} +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_result_t retErr = UR_RESULT_SUCCESS; + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, hipStream)); + retImplEv->start(); + } + + retErr = 
UR_CHECK_ERROR( + hipMemcpyHtoDAsync(hBuffer->mem_.buffer_mem_.get_with_offset(offset), + const_cast(pSrc), size, hipStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingWrite) { + retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_result_t retErr = UR_RESULT_SUCCESS; + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_READ, hQueue, hipStream)); + retImplEv->start(); + } + + retErr = UR_CHECK_ERROR(hipMemcpyDtoHAsync( + pDst, hBuffer->mem_.buffer_mem_.get_with_offset(offset), size, + hipStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingRead) { + retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + 
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pGlobalWorkSize, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t threadsPerBlock[3] = {32u, 1u, 1u}; + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + + { + ur_result_t retError = urDeviceGetInfo( + hQueue->device_, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + retError = + urDeviceGetInfo(hQueue->device_, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + // The maxWorkGroupsSize = 1024 for AMD GPU + // The maxThreadsPerBlock = {1024, 1024, 1024} + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + UR_ASSERT(pLocalWorkSize[dim] <= maxThreadsPerBlock[dim], + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local 
work sizes are neither larger than the + // global work sizes and not 0. + UR_ASSERT(pLocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + UR_ASSERT((pGlobalWorkSize[dim] % pLocalWorkSize[dim]) == 0, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + threadsPerBlock[dim] = pLocalWorkSize[dim]; + return UR_RESULT_SUCCESS; + }; + + for (size_t dim = 0; dim < workDim; dim++) { + auto err = isValid(dim); + if (err != UR_RESULT_SUCCESS) + return err; + } + } else { + simpleGuessLocalWorkSize(threadsPerBlock, pGlobalWorkSize, + maxThreadsPerBlock, hKernel); + } + } + + UR_ASSERT(maxWorkGroupSize >= size_t(threadsPerBlock[0] * threadsPerBlock[1] * + threadsPerBlock[2]), + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + size_t blocksPerGrid[3] = {1u, 1u, 1u}; + + for (size_t i = 0; i < workDim; i++) { + blocksPerGrid[i] = + (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + } + + ur_result_t retError = UR_RESULT_SUCCESS; + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + uint32_t stream_token; + ur_stream_quard guard; + hipStream_t hipStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + hipFunction_t hipFunc = hKernel->get(); + + retError = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + // Set the implicit global offset parameter if kernel has offset variant + if (hKernel->get_with_offset_parameter()) { + std::uint32_t hip_implicit_offset[3] = {0, 0, 0}; + if (pGlobalWorkOffset) { + for (size_t i = 0; i < workDim; i++) { + hip_implicit_offset[i] = + static_cast(pGlobalWorkOffset[i]); + if (pGlobalWorkOffset[i] != 0) { + hipFunc = hKernel->get_with_offset_parameter(); + } + } + } + hKernel->set_implicit_offset_arg(sizeof(hip_implicit_offset), + hip_implicit_offset); + } + + auto argIndices = hKernel->get_arg_indices(); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + 
UR_COMMAND_KERNEL_LAUNCH, hQueue, hipStream, stream_token)); + retImplEv->start(); + } + + // Set local mem max size if env var is present + static const char *local_mem_sz_ptr = + std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE"); + + if (local_mem_sz_ptr) { + int device_max_local_mem = 0; + retError = UR_CHECK_ERROR(hipDeviceGetAttribute( + &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock, + hQueue->get_device()->get())); + + static const int env_val = std::atoi(local_mem_sz_ptr); + if (env_val <= 0 || env_val > device_max_local_mem) { + setErrorMessage("Invalid value specified for " + "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + retError = UR_CHECK_ERROR(hipFuncSetAttribute( + hipFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val)); + } + + retError = UR_CHECK_ERROR(hipModuleLaunchKernel( + hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], + threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], + hKernel->get_local_size(), hipStream, argIndices.data(), nullptr)); + + hKernel->clear_local_size(); + + if (phEvent) { + retError = retImplEv->record(); + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +/// Enqueues a wait on the given queue for all events. +/// See \ref enqueueEventWait +/// +/// Currently queues are represented by a single in-order stream, therefore +/// every command is an implicit barrier and so urEnqueueEventWait has the +/// same behavior as urEnqueueEventWaitWithBarrier. So urEnqueueEventWait can +/// just call urEnqueueEventWaitWithBarrier. 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} + +/// Enqueues a wait on the given queue for all specified events. +/// See \ref enqueueEventWaitWithBarrier +/// +/// If the events list is empty, the enqueued wait will wait on all previous +/// events in the queue. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST) + UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST) + + ur_result_t result; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_quard guard; + hipStream_t hipStream = hQueue->get_next_compute_stream( + numEventsInWaitList, + reinterpret_cast(phEventWaitList), guard, + &stream_token); + { + std::lock_guard guard(hQueue->barrier_mutex_); + if (hQueue->barrier_event_ == nullptr) { + UR_CHECK_ERROR(hipEventCreate(&hQueue->barrier_event_)); + } + if (numEventsInWaitList == 0) { // wait on all work + if (hQueue->barrier_tmp_event_ == nullptr) { + UR_CHECK_ERROR(hipEventCreate(&hQueue->barrier_tmp_event_)); + } + hQueue->sync_streams( + [hipStream, tmp_event = hQueue->barrier_tmp_event_](hipStream_t s) { + if (hipStream != s) { + UR_CHECK_ERROR(hipEventRecord(tmp_event, s)); + UR_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0)); + } + }); + } else { // wait just on given events + forLatestEvents( + reinterpret_cast(phEventWaitList), + numEventsInWaitList, + [hipStream](ur_event_handle_t event) -> 
ur_result_t { + if (event->get_queue()->has_been_synchronized( + event->get_compute_stream_token())) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR( + hipStreamWaitEvent(hipStream, event->get(), 0)); + } + }); + } + + result = + UR_CHECK_ERROR(hipEventRecord(hQueue->barrier_event_, hipStream)); + for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); + i++) { + hQueue->compute_applied_barrier_[i] = false; + } + for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); + i++) { + hQueue->transfer_applied_barrier_[i] = false; + } + } + if (result != UR_RESULT_SUCCESS) { + return result; + } + + if (phEvent) { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, hipStream, stream_token); + (*phEvent)->start(); + (*phEvent)->record(); + } + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// General 3D memory copy operation. +/// This function requires the corresponding HIP context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// must be a pointer to a hipDevPtr +static ur_result_t commonEnqueueMemBufferCopyRect( + hipStream_t hip_stream, ur_rect_region_t region, const void *src_ptr, + const hipMemoryType src_type, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, + const hipMemoryType dst_type, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost); + assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost); + + src_row_pitch = (!src_row_pitch) ? region.width : src_row_pitch; + src_slice_pitch = + (!src_slice_pitch) ? (region.height * src_row_pitch) : src_slice_pitch; + dst_row_pitch = (!dst_row_pitch) ? 
region.width : dst_row_pitch; + dst_slice_pitch = + (!dst_slice_pitch) ? (region.height * dst_row_pitch) : dst_slice_pitch; + + HIP_MEMCPY3D params; + + params.WidthInBytes = region.width; + params.Height = region.height; + params.Depth = region.depth; + + params.srcMemoryType = src_type; + params.srcDevice = src_type == hipMemoryTypeDevice + ? *static_cast(src_ptr) + : 0; + params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr; + params.srcXInBytes = src_offset.x; + params.srcY = src_offset.y; + params.srcZ = src_offset.z; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = dst_type; + params.dstDevice = dst_type == hipMemoryTypeDevice + ? *reinterpret_cast(dst_ptr) + : 0; + params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr; + params.dstXInBytes = dst_offset.x; + params.dstY = dst_offset.y; + params.dstZ = dst_offset.z; + params.dstPitch = dst_row_pitch; + params.dstHeight = dst_slice_pitch / dst_row_pitch; + + return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream)); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(!(region.width == 0 || region.height == 0 || region.width == 0), 
+ UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(!(bufferRowPitch != 0 && bufferRowPitch < region.width), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(!(hostRowPitch != 0 && hostRowPitch < region.width), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(!(bufferSlicePitch != 0 && + bufferSlicePitch < region.height * bufferRowPitch), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(!(bufferSlicePitch != 0 && bufferSlicePitch % bufferRowPitch != 0), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT( + !(hostSlicePitch != 0 && hostSlicePitch < region.height * hostRowPitch), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(!(hostSlicePitch != 0 && hostSlicePitch % hostRowPitch != 0), + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + void *devPtr = hBuffer->mem_.buffer_mem_.get_void(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, hipStream)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, &devPtr, hipMemoryTypeDevice, bufferOrigin, + bufferRowPitch, bufferSlicePitch, pDst, hipMemoryTypeHost, hostOrigin, + hostRowPitch, hostSlicePitch); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingRead) { + retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t 
hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + void *devPtr = hBuffer->mem_.buffer_mem_.get_void(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, hipStream)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, + hostSlicePitch, &devPtr, hipMemoryTypeDevice, bufferOrigin, + bufferRowPitch, bufferSlicePitch); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingWrite) { + retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + ur_result_t result; + auto stream = hQueue->get_next_transfer_stream(); + + if (phEventWaitList) { + result = enqueueEventsWait(hQueue, stream, numEventsInWaitList, + phEventWaitList); + } + + if (phEvent) { + retImplEv = + 
std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); + result = retImplEv->start(); + } + + auto src = hBufferSrc->mem_.buffer_mem_.get_with_offset(srcOffset); + auto dst = hBufferDst->mem_.buffer_mem_.get_with_offset(dstOffset); + + result = UR_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + void *srcPtr = hBufferSrc->mem_.buffer_mem_.get_void(); + void *dstPtr = hBufferDst->mem_.buffer_mem_.get_void(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, hipStream)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, &srcPtr, hipMemoryTypeDevice, srcOrigin, srcRowPitch, + srcSlicePitch, &dstPtr, hipMemoryTypeDevice, dstOrigin, dstRowPitch, + dstSlicePitch); + + if (phEvent) { + retImplEv->record(); + 
*phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + auto args_are_multiples_of_pattern_size = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto pattern_is_valid = (pPattern != nullptr); + + auto pattern_size_is_valid = + ((patternSize & (patternSize - 1)) == 0) && // is power of two + (patternSize > 0) && (patternSize <= 128); // falls within valid range + + UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && + pattern_size_is_valid, + UR_RESULT_ERROR_INVALID_VALUE); + (void)args_are_multiples_of_pattern_size; + (void)pattern_is_valid; + (void)pattern_size_is_valid; + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + auto stream = hQueue->get_next_transfer_stream(); + ur_result_t result; + if (phEventWaitList) { + result = enqueueEventsWait(hQueue, stream, numEventsInWaitList, + phEventWaitList); + } + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); + result = retImplEv->start(); + } + + auto dstDevice = hBuffer->mem_.buffer_mem_.get_with_offset(offset); + auto N = size / patternSize; + + // pattern size in bytes + switch (patternSize) { + case 1: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream)); + break; + } + case 2: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(hipMemsetD16Async(dstDevice, 
value, N, stream)); + break; + } + case 4: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); + break; + } + + default: { + // HIP has no memset functions that allow setting values more than 4 + // bytes. UR API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 1 byte values, and set the buffer using multiple strided calls. + // The first 4 patterns are set using hipMemsetD32Async then all + // subsequent 1 byte patterns are set using hipMemset2DAsync which is + // called for each pattern. + + // Calculate the number of patterns, stride, number of times the pattern + // needs to be applied, and the number of times the first 32 bit pattern + // needs to be applied. + auto number_of_steps = patternSize / sizeof(uint8_t); + auto pitch = number_of_steps * sizeof(uint8_t); + auto height = size / number_of_steps; + auto count_32 = size / sizeof(uint32_t); + + // Get 4-byte chunk of the pattern and call hipMemsetD32Async + auto value = *(static_cast(pPattern)); + result = + UR_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream)); + for (auto step = 4u; step < number_of_steps; ++step) { + // take 1 byte of the pattern + value = *(static_cast(pPattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto offset_ptr = reinterpret_cast( + reinterpret_cast(dstDevice) + (step * sizeof(uint8_t))); + + // set all of the pattern chunks + result = UR_CHECK_ERROR(hipMemset2DAsync( + offset_ptr, pitch, value, sizeof(uint8_t), height, stream)); + } + break; + } + } + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// General ND memory copy operation for images (where N > 1). 
+/// This function requires the corresponding HIP context to be at the top of +/// the context stack +/// If the source and/or destination is an array, src_ptr and/or dst_ptr +/// must be a pointer to a hipArray +static ur_result_t commonEnqueueMemImageNDCopy( + hipStream_t hip_stream, ur_mem_type_t img_type, const size_t *region, + const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset, + void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) { + UR_ASSERT(region, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost, + UR_RESULT_ERROR_INVALID_VALUE); + + if (img_type == UR_MEM_TYPE_IMAGE2D) { + hip_Memcpy2D cpyDesc; + memset(&cpyDesc, 0, sizeof(cpyDesc)); + cpyDesc.srcMemoryType = src_type; + if (src_type == hipMemoryTypeArray) { + cpyDesc.srcArray = + reinterpret_cast(const_cast(src_ptr)); + cpyDesc.srcXInBytes = src_offset[0]; + cpyDesc.srcY = src_offset[1]; + } else { + cpyDesc.srcHost = src_ptr; + } + cpyDesc.dstMemoryType = dst_type; + if (dst_type == hipMemoryTypeArray) { + cpyDesc.dstArray = + reinterpret_cast(const_cast(dst_ptr)); + cpyDesc.dstXInBytes = dst_offset[0]; + cpyDesc.dstY = dst_offset[1]; + } else { + cpyDesc.dstHost = dst_ptr; + } + cpyDesc.WidthInBytes = region[0]; + cpyDesc.Height = region[1]; + return UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream)); + } + + if (img_type == UR_MEM_TYPE_IMAGE3D) { + + HIP_MEMCPY3D cpyDesc; + memset(&cpyDesc, 0, sizeof(cpyDesc)); + cpyDesc.srcMemoryType = src_type; + if (src_type == hipMemoryTypeArray) { + cpyDesc.srcArray = + reinterpret_cast(const_cast(src_ptr)); + cpyDesc.srcXInBytes = src_offset[0]; + cpyDesc.srcY = src_offset[1]; + cpyDesc.srcZ = src_offset[2]; + } else { + cpyDesc.srcHost = src_ptr; + } + cpyDesc.dstMemoryType = dst_type; + if (dst_type == hipMemoryTypeArray) { + 
cpyDesc.dstArray = reinterpret_cast(dst_ptr); + cpyDesc.dstXInBytes = dst_offset[0]; + cpyDesc.dstY = dst_offset[1]; + cpyDesc.dstZ = dst_offset[2]; + } else { + cpyDesc.dstHost = dst_ptr; + } + cpyDesc.WidthInBytes = region[0]; + cpyDesc.Height = region[1]; + cpyDesc.Depth = region[2]; + return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream)); + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_ERROR_INVALID_VALUE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + + if (phEventWaitList) { + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + } + + hipArray *array = hImage->mem_.surface_mem_.get_array(); + + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); + + int elementByteSize = imageElementByteSize(Format); + + size_t byteOffsetX = origin.x * elementByteSize * NumChannels; + size_t bytesToCopy = elementByteSize * NumChannels * region.depth; + + auto imgType = hImage->mem_.surface_mem_.get_image_type(); + + size_t adjustedRegion[3] = {bytesToCopy, region.height, region.height}; + size_t srcOffset[3] = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy(hipStream, 
imgType, adjustedRegion, + array, hipMemoryTypeArray, srcOffset, + pDst, hipMemoryTypeHost, nullptr); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_READ, hQueue, hipStream); + new_event->record(); + *phEvent = new_event; + } + + if (blockingRead) { + retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = blockingWrite; + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + + if (phEventWaitList) { + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + } + + hipArray *array = hImage->mem_.surface_mem_.get_array(); + + hipArray_Format Format; + size_t NumChannels; + getArrayDesc(array, Format, NumChannels); + + int elementByteSize = imageElementByteSize(Format); + + size_t byteOffsetX = origin.x * elementByteSize * NumChannels; + size_t bytesToCopy = elementByteSize * NumChannels * region.depth; + + auto imgType = hImage->mem_.surface_mem_.get_image_type(); + + size_t adjustedRegion[3] 
= {bytesToCopy, region.height, region.height}; + size_t dstOffset[3] = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, + pSrc, hipMemoryTypeHost, nullptr, + array, hipMemoryTypeArray, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_WRITE, hQueue, hipStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE) + UR_ASSERT(hImageSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE) + UR_ASSERT(hImageDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE) + UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == + hImageDst->mem_.surface_mem_.get_image_type(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + if (phEventWaitList) { + retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + } + + hipArray *srcArray = hImageSrc->mem_.surface_mem_.get_array(); + hipArray_Format srcFormat; + size_t srcNumChannels; + getArrayDesc(srcArray, srcFormat, srcNumChannels); + + hipArray 
*dstArray = hImageDst->mem_.surface_mem_.get_array(); + hipArray_Format dstFormat; + size_t dstNumChannels; + getArrayDesc(dstArray, dstFormat, dstNumChannels); + + UR_ASSERT(srcFormat == dstFormat, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(srcNumChannels == dstNumChannels, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + int elementByteSize = imageElementByteSize(srcFormat); + + size_t dstByteOffsetX = dstOrigin.x * elementByteSize * srcNumChannels; + size_t srcByteOffsetX = srcOrigin.x * elementByteSize * dstNumChannels; + size_t bytesToCopy = elementByteSize * srcNumChannels * region.depth; + + auto imgType = hImageSrc->mem_.surface_mem_.get_image_type(); + + size_t adjustedRegion[3] = {bytesToCopy, region.height, region.width}; + size_t srcOffset[3] = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; + size_t dstOffset[3] = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray, + srcOffset, dstArray, hipMemoryTypeArray, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_COPY, hQueue, hipStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the ur_mem_handle_t object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(ppRetMap, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t ret_err = UR_RESULT_ERROR_INVALID_OPERATION; + const bool is_pinned = + hBuffer->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + // Currently no support for overlapping regions + if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { + return ret_err; + } + + // Allocate a pointer in the host to store the mapped information + auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); + *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); + if (hostPtr) { + ret_err = UR_RESULT_SUCCESS; + } + + if (!is_pinned && + ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. 
+ ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + hostPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = + ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, + hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + return ret_err; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given hMem. +/// If hMem uses pinned host memory, this will not do a write. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t ret_err = UR_RESULT_SUCCESS; + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + const bool is_pinned = + hMem->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + if (!is_pinned && + ((hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE) || + (hMem->mem_.buffer_mem_.get_map_flags() & + UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { + // Pinned host memory is only on host so it doesn't need to be written to. 
+ ret_err = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), + hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + hMem->mem_.buffer_mem_.unmap(pMappedPtr); + return ret_err; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, void *ptr, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_quard guard; + hipStream_t hipStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_FILL, hQueue, hipStream, stream_token)); + event_ptr->start(); + } + switch (patternSize) { + case 1: + result = UR_CHECK_ERROR( + hipMemsetD8Async(reinterpret_cast(ptr), + *(const uint8_t *)pPattern & 0xFF, size, hipStream)); + break; + case 2: + result = UR_CHECK_ERROR(hipMemsetD16Async( + reinterpret_cast(ptr), + *(const uint16_t *)pPattern & 0xFFFF, size, 
hipStream)); + break; + case 4: + result = UR_CHECK_ERROR(hipMemsetD32Async( + reinterpret_cast(ptr), + *(const uint32_t *)pPattern & 0xFFFFFFFF, size, hipStream)); + break; + + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = UR_RESULT_SUCCESS; + + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_MEMCPY, hQueue, hipStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR( + hipMemcpyAsync(pDst, pSrc, size, hipMemcpyDefault, hipStream)); + if (phEvent) { + result = event_ptr->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + if (phEvent) { + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + // 
flags is currently unused so fail if set + if (flags != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_PREFETCH, hQueue, hipStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR(hipMemPrefetchAsync( + pMem, size, hQueue->get_context()->get_device()->get(), hipStream)); + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { + std::ignore = size; + std::ignore = advice; + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + // TODO implement a mapping to hipMemAdvise once the expected behaviour + // of urEnqueueUSMAdvise is detailed in the USM extension + return urEnqueueEventsWait(hQueue, 0, nullptr, phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + std::ignore = hQueue; + std::ignore = pMem; + std::ignore = pitch; + std::ignore = patternSize; + std::ignore = pPattern; + std::ignore = width; + std::ignore = height; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// 2D Memcpy API +/// 
+/// \param hQueue is the queue to submit to +/// \param blocking is whether this operation should block the host +/// \param pDst is the location the data will be copied +/// \param dstPitch is the total width of the destination memory including +/// padding +/// \param pSrc is the data to be copied +/// \param srcPitch is the total width of the source memory including padding +/// \param width is width in bytes of each row to be copied +/// \param height is height the columns to be copied +/// \param numEventsInWaitList is the number of events to wait on +/// \param phEventWaitList is an array of events to wait on +/// \param phEvent is the event that represents this operation +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + hipStream_t hipStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + (*phEvent) = ur_event_handle_t_::make_native(UR_COMMAND_USM_MEMCPY_2D, + hQueue, hipStream); + (*phEvent)->start(); + } + + result = + UR_CHECK_ERROR(hipMemcpy2DAsync(pDst, dstPitch, pSrc, srcPitch, width, + height, hipMemcpyDefault, hipStream)); + + if (phEvent) { + (*phEvent)->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + } catch (ur_result_t err) { + result = err; + } + + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, 
ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = name; + std::ignore = blockingWrite; + std::ignore = count; + std::ignore = offset; + std::ignore = pSrc; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = name; + std::ignore = blockingRead; + std::ignore = count; + std::ignore = offset; + std::ignore = pDst; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pDst; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pSrc; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} From 9291c19e44d5a1c17aac46d03b1d6593bc25cb10 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 19 May 2023 15:18:42 +0100 Subject: [PATCH 14/42] [SYCL][HIP][UR] Add urGetLastResult to common.cpp --- .../unified_runtime/ur/adapters/hip/common.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp index e2c4f967b24a6..c534232a045d1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -94,3 +94,15 @@ thread_local char ErrorMessage[MaxMessageSize]; strcpy(ErrorMessage, message); ErrorMessageCode = error_code; } + +ur_result_t zerPluginGetLastError(char **message) { + *message = &ErrorMessage[0]; + return ErrorMessageCode; +} + +// Returns plugin specific error and warning messages; common implementation +// that can be shared between adapters +ur_result_t urGetLastResult(ur_platform_handle_t, const char **ppMessage) { + *ppMessage = &ErrorMessage[0]; + return ErrorMessageCode; +} From d4ee7890e076712392be2820654458e9ceb9bcef Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 19 May 2023 15:33:03 +0100 Subject: [PATCH 15/42] [SYCL][HIP][UR] Cleanup pi_hip.cpp/.hpp --- sycl/plugins/hip/pi_hip.cpp | 117 ------------------ sycl/plugins/hip/pi_hip.hpp | 82 ------------ .../ur/adapters/hip/context.hpp | 38 ++++++ .../ur/adapters/hip/kernel.hpp | 16 +++ .../ur/adapters/hip/platform.hpp | 5 + .../ur/adapters/hip/program.hpp | 2 + 6 files changed, 61 insertions(+), 199 deletions(-) diff --git 
a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 3873c7910c5b4..39a8189e01657 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -27,123 +27,6 @@ #include #include -namespace { -pi_result map_error(hipError_t result) { - switch (result) { - case hipSuccess: - return PI_SUCCESS; - case hipErrorInvalidContext: - return PI_ERROR_INVALID_CONTEXT; - case hipErrorInvalidDevice: - return PI_ERROR_INVALID_DEVICE; - case hipErrorInvalidValue: - return PI_ERROR_INVALID_VALUE; - case hipErrorOutOfMemory: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case hipErrorLaunchOutOfResources: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -thread_local pi_result ErrorMessageCode = PI_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] static void setErrorMessage(const char *message, - pi_result error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -/// Converts HIP error into PI error codes, and outputs error information -/// to stderr. -/// If PI_HIP_ABORT env variable is defined, it aborts directly instead of -/// throwing the error. This is intended for debugging purposes. -/// \return PI_SUCCESS if \param result was hipSuccess. -/// \throw pi_error exception (integer) if input was not success. 
-/// -pi_result check_error(hipError_t result, const char *function, int line, - const char *file) { - if (result == hipSuccess) { - return PI_SUCCESS; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - errorName = hipGetErrorName(result); - errorString = hipGetErrorString(result); - std::stringstream ss; - ss << "\nPI HIP ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" - << std::endl; - std::cerr << ss.str(); - } - - if (std::getenv("PI_HIP_ABORT") != nullptr) { - std::abort(); - } - - throw map_error(result); -} - -/// \cond NODOXY -#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) - -} // anonymous namespace - -/// ------ Error handling, matching OpenCL plugin semantics. -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -namespace pi { - -// Report error and no return (keeps compiler from printing warnings). -// TODO: Probably change that to throw a catchable exception, -// but for now it is useful to see every failure. -// -[[noreturn]] void die(const char *Message) { - std::cerr << "pi_die: " << Message << std::endl; - std::terminate(); -} - -// Reports error messages -void hipPrint(const char *Message) { - std::cerr << "pi_print: " << Message << std::endl; -} - -void assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); -} - -} // namespace pi -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -/// \endcond - -// makes all future work submitted to queue wait for all work captured in event. -pi_result enqueueEventWait(pi_queue queue, pi_event event) { - // for native events, the hipStreamWaitEvent call is used. 
- // This makes all future work submitted to stream wait for all - // work captured in event. - queue->for_each_stream([e = event->get()](hipStream_t s) { - PI_CHECK_ERROR(hipStreamWaitEvent(s, e, 0)); - }); - return PI_SUCCESS; -} - //-- PI API implementation extern "C" { diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 110870bdf42e6..7fd71881b7c83 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -53,70 +53,18 @@ using _pi_stream_guard = std::unique_lock; -/// A PI platform stores all known PI devices, -/// in the HIP plugin this is just a vector of -/// available devices since initialization is done -/// when devices are used. -/// struct _pi_platform : ur_platform_handle_t_ { using ur_platform_handle_t_::ur_platform_handle_t_; }; -/// PI device mapping to a hipDevice_t. -/// Includes an observer pointer to the platform, -/// and implements the reference counting semantics since -/// HIP objects are not refcounted. -/// struct _pi_device : ur_device_handle_t_ { using ur_device_handle_t_::ur_device_handle_t_; }; -/// PI context mapping to a HIP context object. -/// -/// There is no direct mapping between a HIP context and a PI context, -/// main differences described below: -/// -/// HIP context vs PI context -/// -/// One of the main differences between the PI API and the HIP driver API is -/// that the second modifies the state of the threads by assigning -/// `hipCtx_t` objects to threads. `hipCtx_t` objects store data associated -/// with a given device and control access to said device from the user side. -/// PI API context are objects that are passed to functions, and not bound -/// to threads. -/// The _pi_context object doesn't implement this behavior, only holds the -/// HIP context data. The RAII object \ref ScopedContext implements the active -/// context behavior. 
-/// -/// Primary vs User-defined context -/// -/// HIP has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// HIP documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the HIP Runtime API. -/// For PI applications to interop with HIP Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `_pi_context` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the PI object interface is always the same. -/// -/// Destructor callback -/// -/// Required to implement CP023, SYCL Extended Context Destruction, -/// the PI Context can store a number of callback functions that will be -/// called upon destruction of the PI Context. -/// See proposal for details. -/// struct _pi_context : ur_context_handle_t_ { using ur_context_handle_t_::ur_context_handle_t_; }; -/// PI Mem mapping to HIP memory allocations, both data and texture/surface. -/// \brief Represents non-SVM allocations on the HIP backend. -/// Keeps tracks of all mapped regions used for Map/Unmap calls. -/// Only one region can be active at the same time per allocation. 
struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; @@ -125,50 +73,20 @@ struct _pi_queue : ur_queue_handle_t_ { using ur_queue_handle_t_::ur_queue_handle_t_; }; -typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, - void *userData); - struct _pi_event : ur_event_handle_t_ { using ur_event_handle_t_::ur_event_handle_t_; }; -/// Implementation of PI Program on HIP Module object -/// struct _pi_program : ur_program_handle_t_ { using ur_program_handle_t_::ur_program_handle_t_; }; -/// Implementation of a PI Kernel for HIP -/// -/// PI Kernels are used to set kernel arguments, -/// creating a state on the Kernel object for a given -/// invocation. This is not the case of HIPFunction objects, -/// which are simply passed together with the arguments on the invocation. -/// The PI Kernel implementation for HIP stores the list of arguments, -/// argument sizes and offsets to emulate the interface of PI Kernel, -/// saving the arguments for the later dispatch. -/// Note that in PI API, the Local memory is specified as a size per -/// individual argument, but in HIP only the total usage of shared -/// memory is required since it is not passed as a parameter. -/// A compiler pass converts the PI API local memory model into the -/// HIP shared model. This object simply calculates the total of -/// shared memory, and the initial offsets of each parameter. -/// struct _pi_kernel : ur_kernel_handle_t_ { using ur_kernel_handle_t_::ur_kernel_handle_t_; }; -/// Implementation of samplers for HIP -/// -/// Sampler property layout: -/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | -/// | N/A | addressing mode | fiter mode | normalize coords | struct _pi_sampler : ur_sampler_handle_t_ { using ur_sampler_handle_t_::ur_sampler_handle_t_; }; -// ------------------------------------------------------------- -// Helper types and functions -// - #endif // PI_HIP_HPP diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp index 05f246ef7dc1a..98b8702213f2e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp @@ -13,6 +13,44 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); +/// UR context mapping to a HIP context object. +/// +/// There is no direct mapping between a HIP context and a UR context, +/// main differences described below: +/// +/// HIP context vs UR context +/// +/// One of the main differences between the UR API and the HIP driver API is +/// that the second modifies the state of the threads by assigning +/// `hipCtx_t` objects to threads. `hipCtx_t` objects store data associated +/// with a given device and control access to said device from the user side. +/// UR API context are objects that are passed to functions, and not bound +/// to threads. +/// The ur_context_handle_t_ object doesn't implement this behavior, only holds the +/// HIP context data. The RAII object \ref ScopedContext implements the active +/// context behavior. +/// +/// Primary vs User-defined context +/// +/// HIP has two different types of context, the Primary context, +/// which is usable by all threads on a given process for a given device, and +/// the aforementioned custom contexts. +/// HIP documentation, and performance analysis, indicates it is recommended +/// to use Primary context whenever possible. +/// Primary context is used as well by the HIP Runtime API. 
+/// For UR applications to interop with HIP Runtime API, they have to use +/// the primary context - and make that active in the thread. +/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `user-defined` context, so that +/// the UR object interface is always the same. +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the UR Context can store a number of callback functions that will be +/// called upon destruction of the UR Context. +/// See proposal for details. +/// struct ur_context_handle_t_ { struct deleter_data { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp index d2fb5f6be288c..53fd8368e34a7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp @@ -15,6 +15,22 @@ #include "program.hpp" +/// Implementation of a UR Kernel for HIP +/// +/// UR Kernels are used to set kernel arguments, +/// creating a state on the Kernel object for a given +/// invocation. This is not the case of HIPFunction objects, +/// which are simply passed together with the arguments on the invocation. +/// The UR Kernel implementation for HIP stores the list of arguments, +/// argument sizes and offsets to emulate the interface of UR Kernel, +/// saving the arguments for the later dispatch. +/// Note that in UR API, the Local memory is specified as a size per +/// individual argument, but in HIP only the total usage of shared +/// memory is required since it is not passed as a parameter. +/// A compiler pass converts the UR API local memory model into the +/// HIP shared model. This object simply calculates the total of +/// shared memory, and the initial offsets of each parameter. 
+/// struct ur_kernel_handle_t_ { using native_type = hipFunction_t; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp index fb89d5bea24ea..cf9c80c2365f5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp @@ -12,6 +12,11 @@ #include +/// A UR platform stores all known UR devices, +/// in the HIP plugin this is just a vector of +/// available devices since initialization is done +/// when devices are used. +/// struct ur_platform_handle_t_ { static hipEvent_t evBase_; // HIP event used as base counter std::vector> devices_; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp index aa1f3a4657df7..9e144798ad0d6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp @@ -13,6 +13,8 @@ #include "context.hpp" +/// Implementation of UR Program on HIP Module object +/// struct ur_program_handle_t_ { using native_type = hipModule_t; native_type module_; From c3807dcb55fd763b4b973729eb4f158a4ededdf0 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 19 May 2023 16:45:55 +0100 Subject: [PATCH 16/42] [SYCL][HIP][UR] Update DDI tables for enqueue entry-points --- .../ur/adapters/hip/ur_interface_loader.cpp | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 89ab252979488..2181e2edc3640 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -166,29 +166,31 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - 
pDdiTable->pfnDeviceGlobalVariableRead = nullptr; - pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; - pDdiTable->pfnEventsWait = nullptr; - pDdiTable->pfnEventsWaitWithBarrier = nullptr; - pDdiTable->pfnKernelLaunch = nullptr; - pDdiTable->pfnMemBufferCopy = nullptr; - pDdiTable->pfnMemBufferCopyRect = nullptr; - pDdiTable->pfnMemBufferFill = nullptr; - pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; - pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; - pDdiTable->pfnMemBufferWriteRect = nullptr; - pDdiTable->pfnMemImageCopy = nullptr; - pDdiTable->pfnMemImageRead = nullptr; - pDdiTable->pfnMemImageWrite = nullptr; - pDdiTable->pfnMemUnmap = nullptr; - pDdiTable->pfnUSMFill2D = nullptr; - pDdiTable->pfnUSMFill = nullptr; - pDdiTable->pfnUSMAdvise = nullptr; - pDdiTable->pfnUSMMemcpy2D = nullptr; - pDdiTable->pfnUSMMemcpy = nullptr; - pDdiTable->pfnUSMPrefetch = nullptr; + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; + pDdiTable->pfnUSMFill2D = 
urEnqueueUSMFill2D;
+  pDdiTable->pfnUSMFill = urEnqueueUSMFill;
+  pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise;
+  pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D;
+  pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy;
+  pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch;
+  pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe;
+  pDdiTable->pfnWriteHostPipe = urEnqueueWriteHostPipe;
   return UR_RESULT_SUCCESS;
 }

From cc3a9539ade12e4c77da61439d300692e26564d6 Mon Sep 17 00:00:00 2001
From: Omar Ahmed
Date: Wed, 24 May 2023 13:55:23 +0100
Subject: [PATCH 17/42] [SYCL][HIP][UR] Remove queue backward compatibility APIs

---
 sycl/plugins/hip/pi_hip.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 39a8189e01657..bd9791ee1b696 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -77,18 +77,14 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   // Queue
   _PI_CL(piQueueCreate, pi2ur::piQueueCreate)
   _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate)
-  _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2)
   _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo)
   _PI_CL(piQueueFinish, pi2ur::piQueueFinish)
   _PI_CL(piQueueFlush, pi2ur::piQueueFlush)
   _PI_CL(piQueueRetain, pi2ur::piQueueRetain)
   _PI_CL(piQueueRelease, pi2ur::piQueueRelease)
   _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle)
-  _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2)
   _PI_CL(piextQueueCreateWithNativeHandle,
          pi2ur::piextQueueCreateWithNativeHandle)
-  _PI_CL(piextQueueCreateWithNativeHandle2,
-         pi2ur::piextQueueCreateWithNativeHandle2)
   // Memory
   _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate)
   _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate)

From f0fb747acabb4093dc99bc9954520deb2fd932e7 Mon Sep 17 00:00:00 2001
From: Omar Ahmed
Date: Wed, 24 May 2023 13:58:54 +0100
Subject: [PATCH 18/42] [SYCL][HIP][UR] Add usmPool entry points to DDI tables and fix ur*nativeHandle APIs

---
sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp | 2 +- .../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp | 5 +++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 032f113bd0d78..c479644e87bef 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -922,6 +922,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 3f6cbd9eb223c..147ea15f32621 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -139,6 +139,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + const ur_program_native_properties_t *pProperties, ur_program_handle_t *phProgram) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index 272e8b2a29ea7..fb1305e155b19 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -264,7 +264,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { /// /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( - ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { + ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 2181e2edc3640..f0eb6008d8a36 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -234,8 +234,9 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnPoolRetain = nullptr; + pDdiTable->pfnPoolRelease = nullptr; + pDdiTable->pfnPoolGetInfo = nullptr; pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } From 0708bcf60e21849362551e7b4fafdf79b46ce746 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 5 Jun 2023 17:05:29 +0100 Subject: [PATCH 19/42] [SYCL][HIP][UR] Allow urKernelSetArgMemObj to set the arg to nullptr --- sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 4c69ad3a37962..c7909fae2f5d6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -284,7 +284,13 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urKernelSetArgMemObj( ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { UR_ASSERT(hKernel != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hArgValue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // Below sets kernel arg when zero-sized buffers are handled. + // In such case the corresponding memory is null. + if (hArgValue == nullptr) { + hKernel->set_kernel_arg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } ur_result_t retErr = UR_RESULT_SUCCESS; try { From 043fbd2b136ffc786f4d7cbaf0648534105648bb Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 8 Jun 2023 17:09:19 +0100 Subject: [PATCH 20/42] [SYCL][HIP][UR] Change the code style to be more consistent --- .../ur/adapters/hip/common.cpp | 76 +- .../ur/adapters/hip/common.hpp | 34 +- .../ur/adapters/hip/context.cpp | 130 +- .../ur/adapters/hip/context.hpp | 112 +- .../ur/adapters/hip/device.cpp | 635 ++++----- .../ur/adapters/hip/device.hpp | 24 +- .../ur/adapters/hip/enqueue.cpp | 1130 ++++++++--------- .../unified_runtime/ur/adapters/hip/event.cpp | 224 ++-- .../unified_runtime/ur/adapters/hip/event.hpp | 152 ++- .../ur/adapters/hip/kernel.cpp | 184 +-- .../ur/adapters/hip/kernel.hpp | 166 +-- .../ur/adapters/hip/memory.cpp | 366 +++--- .../ur/adapters/hip/memory.hpp | 209 ++- .../ur/adapters/hip/platform.cpp | 83 +- .../ur/adapters/hip/platform.hpp | 4 +- .../ur/adapters/hip/program.cpp | 190 ++- .../ur/adapters/hip/program.hpp | 32 +- .../unified_runtime/ur/adapters/hip/queue.cpp | 226 ++-- .../unified_runtime/ur/adapters/hip/queue.hpp | 289 +++-- .../ur/adapters/hip/sampler.cpp | 40 +- .../ur/adapters/hip/sampler.hpp | 16 +- .../ur/adapters/hip/ur_interface_loader.cpp | 6 +- .../unified_runtime/ur/adapters/hip/usm.cpp | 138 +- 23 files changed, 2209 insertions(+), 2257 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp index c534232a045d1..071905d3614e3 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -9,8 +9,8 @@ #include -ur_result_t map_error_ur(hipError_t result) { - switch (result) { +ur_result_t mapErrorUR(hipError_t Result) { + switch (Result) { case hipSuccess: return UR_RESULT_SUCCESS; case hipErrorInvalidContext: @@ -28,76 +28,70 @@ ur_result_t map_error_ur(hipError_t result) { } } -ur_result_t check_error_ur(hipError_t result, const char *function, int line, - const char *file) { - if (result == hipSuccess) { +ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, + const char *File) { + if (Result == hipSuccess) { return UR_RESULT_SUCCESS; } if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - errorName = hipGetErrorName(result); - errorString = hipGetErrorString(result); - std::stringstream ss; - ss << "\nUR HIP ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + ErrorName = hipGetErrorName(Result); + ErrorString = hipGetErrorString(Result); + std::stringstream SS; + SS << "\nUR HIP ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" << std::endl; - std::cerr << ss.str(); + std::cerr << SS.str(); } if (std::getenv("PI_HIP_ABORT") != nullptr) { std::abort(); } - throw map_error_ur(result); + throw mapErrorUR(Result); } std::string getHipVersionString() { - int driver_version = 0; - if (hipDriverGetVersion(&driver_version) != hipSuccess) { + int DriverVersion = 0; + if (hipDriverGetVersion(&DriverVersion) != hipSuccess) { return ""; } // The version is 
returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "HIP " << driver_version / 1000 << "." - << driver_version % 1000 / 10; - return stream.str(); + std::stringstream Stream; + Stream << "HIP " << DriverVersion / 1000 << "." << DriverVersion % 1000 / 10; + return Stream.str(); } -void sycl::detail::ur::die(const char *Message) { - std::cerr << "ur_die: " << Message << std::endl; +void sycl::detail::ur::die(const char *pMessage) { + std::cerr << "ur_die: " << pMessage << std::endl; std::terminate(); } -void sycl::detail::ur::assertion(bool Condition, const char *Message) { +void sycl::detail::ur::assertion(bool Condition, const char *pMessage) { if (!Condition) - die(Message); + die(pMessage); } -void sycl::detail::ur::hipPrint(const char *Message) { - std::cerr << "ur_print: " << Message << std::endl; +void sycl::detail::ur::hipPrint(const char *pMessage) { + std::cerr << "ur_print: " << pMessage << std::endl; } -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +// Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -ur_result_t zerPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode) { + assert(strlen(pMessage) <= MaxMessageSize); + strcpy(ErrorMessage, pMessage); + ErrorMessageCode = ErrorCode; } // Returns plugin specific error and warning messages; common implementation diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp index 010b40d6b46a5..7d010c4a6ac93 100644 
--- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp @@ -15,17 +15,17 @@ // indexed, but on NVidia it is an opaque type and needs to go through // cuArrayGetDescriptor so implement a utility function to get the array // properties -inline void getArrayDesc(hipArray *array, hipArray_Format &format, - size_t &channels) { +inline void getArrayDesc(hipArray *Array, hipArray_Format &Format, + size_t &Channels) { #if defined(__HIP_PLATFORM_AMD__) - format = array->Format; - channels = array->NumChannels; + Format = Array->Format; + Channels = Array->NumChannels; #elif defined(__HIP_PLATFORM_NVIDIA__) - CUDA_ARRAY_DESCRIPTOR arrayDesc; - cuArrayGetDescriptor(&arrayDesc, (CUarray)array); + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + cuArrayGetDescriptor(&ArrayDesc, (CUarray)Array); - format = arrayDesc.Format; - channels = arrayDesc.NumChannels; + Format = ArrayDesc.Format; + Channels = ArrayDesc.NumChannels; #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif @@ -66,13 +66,13 @@ typedef hipArray *hipCUarray; #define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED #endif -ur_result_t map_error_ur(hipError_t result); +ur_result_t mapErrorUR(hipError_t Result); -ur_result_t check_error_ur(hipError_t result, const char *function, int line, - const char *file); +ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, + const char *File); #define UR_CHECK_ERROR(result) \ - check_error_ur(result, __func__, __LINE__, __FILE__) + checkErrorUR(result, __func__, __LINE__, __FILE__) std::string getHipVersionString(); @@ -81,8 +81,8 @@ extern thread_local ur_result_t ErrorMessageCode; extern thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); +[[maybe_unused]] void setErrorMessage(const char *Message, + ur_result_t ErrorCode); 
/// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { @@ -94,12 +94,12 @@ namespace ur { // TODO: Probably change that to throw a catchable exception, // but for now it is useful to see every failure. // -[[noreturn]] void die(const char *Message); +[[noreturn]] void die(const char *pMessage); // Reports error messages -void hipPrint(const char *Message); +void hipPrint(const char *pMessage); -void assertion(bool Condition, const char *Message = nullptr); +void assertion(bool Condition, const char *pMessage = nullptr); } // namespace ur } // namespace detail diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp index e3949881c4879..16f162ff35031 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -17,67 +17,68 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) { + std::ignore = pProperties; + UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); assert(DeviceCount == 1); - ur_result_t errcode_ret = UR_RESULT_SUCCESS; + ur_result_t RetErr = UR_RESULT_SUCCESS; - std::unique_ptr urContextPtr{nullptr}; + std::unique_ptr ContextPtr{nullptr}; try { - hipCtx_t current = nullptr; + hipCtx_t Current = nullptr; // Create a scoped context. 
- hipCtx_t newContext; - UR_CHECK_ERROR(hipCtxGetCurrent(¤t)); - errcode_ret = UR_CHECK_ERROR( - hipCtxCreate(&newContext, hipDeviceMapHost, phDevices[0]->get())); - urContextPtr = - std::unique_ptr(new ur_context_handle_t_{ - ur_context_handle_t_::kind::user_defined, newContext, *phDevices}); - - static std::once_flag initFlag; + hipCtx_t NewContext; + UR_CHECK_ERROR(hipCtxGetCurrent(&Current)); + RetErr = UR_CHECK_ERROR( + hipCtxCreate(&NewContext, hipDeviceMapHost, phDevices[0]->get())); + ContextPtr = std::unique_ptr(new ur_context_handle_t_{ + ur_context_handle_t_::kind::UserDefined, NewContext, *phDevices}); + + static std::once_flag InitFlag; std::call_once( - initFlag, - [](ur_result_t &err) { + InitFlag, + [](ur_result_t &Err) { // Use default stream to record base event counter - UR_CHECK_ERROR(hipEventCreateWithFlags( - &ur_platform_handle_t_::evBase_, hipEventDefault)); - UR_CHECK_ERROR(hipEventRecord(ur_platform_handle_t_::evBase_, 0)); + UR_CHECK_ERROR(hipEventCreateWithFlags(&ur_platform_handle_t_::EvBase, + hipEventDefault)); + UR_CHECK_ERROR(hipEventRecord(ur_platform_handle_t_::EvBase, 0)); }, - errcode_ret); + RetErr); // For non-primary scoped contexts keep the last active on top of the stack // as `cuCtxCreate` replaces it implicitly otherwise. // Primary contexts are kept on top of the stack, so the previous context // is not queried and therefore not recovered. - if (current != nullptr) { - UR_CHECK_ERROR(hipCtxSetCurrent(current)); + if (Current != nullptr) { + UR_CHECK_ERROR(hipCtxSetCurrent(Current)); } - *phContext = urContextPtr.release(); - } catch (ur_result_t err) { - errcode_ret = err; + *phContext = ContextPtr.release(); + } catch (ur_result_t Err) { + RetErr = Err; } catch (...) 
{ - errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return errcode_ret; + return RetErr; } -UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( - ur_context_handle_t hContext, ur_context_info_t ContextInfoType, - size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { +UR_APIEXPORT ur_result_t UR_APICALL +urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - switch (uint32_t{ContextInfoType}) { + switch (uint32_t{propName}) { case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(1); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->get_device()); + return ReturnValue(hContext->getDevice()); case UR_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(hContext->get_reference_count()); + return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: case UR_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: @@ -102,51 +103,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (ctxt->decrement_reference_count() > 0) { + if (hContext->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } - ctxt->invoke_extended_deleters(); + hContext->invokeExtendedDeleters(); - std::unique_ptr context{ctxt}; + std::unique_ptr context{hContext}; - if (!ctxt->is_primary()) { - hipCtx_t hipCtxt = ctxt->get(); + if 
(!hContext->isPrimary()) { + hipCtx_t HIPCtxt = hContext->get(); // hipCtxSynchronize is not supported for AMD platform so we can just // destroy the context, for NVIDIA make sure it's synchronized. #if defined(__HIP_PLATFORM_NVIDIA__) - hipCtx_t current = nullptr; - UR_CHECK_ERROR(hipCtxGetCurrent(¤t)); - if (hipCtxt != current) { - UR_CHECK_ERROR(hipCtxPushCurrent(hipCtxt)); + hipCtx_t Current = nullptr; + UR_CHECK_ERROR(hipCtxGetCurrent(&Current)); + if (HIPCtxt != Current) { + UR_CHECK_ERROR(hipCtxPushCurrent(HIPCtxt)); } UR_CHECK_ERROR(hipCtxSynchronize()); - UR_CHECK_ERROR(hipCtxGetCurrent(¤t)); - if (hipCtxt == current) { - UR_CHECK_ERROR(hipCtxPopCurrent(¤t)); + UR_CHECK_ERROR(hipCtxGetCurrent(&Current)); + if (HIPCtxt == Current) { + UR_CHECK_ERROR(hipCtxPopCurrent(&Current)); } #endif - return UR_CHECK_ERROR(hipCtxDestroy(hipCtxt)); + return UR_CHECK_ERROR(hipCtxDestroy(HIPCtxt)); } else { // Primary context is not destroyed, but released - hipDevice_t hipDev = ctxt->get_device()->get(); - hipCtx_t current; - UR_CHECK_ERROR(hipCtxPopCurrent(¤t)); - return UR_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev)); + hipDevice_t HIPDev = hContext->getDevice()->get(); + hipCtx_t Current; + UR_CHECK_ERROR(hipCtxPopCurrent(&Current)); + return UR_CHECK_ERROR(hipDevicePrimaryCtxRelease(HIPDev)); } - hipCtx_t hipCtxt = ctxt->get(); - return UR_CHECK_ERROR(hipCtxDestroy(hipCtxt)); + hipCtx_t HIPCtxt = hContext->get(); + return UR_CHECK_ERROR(hipCtxDestroy(HIPCtxt)); } -UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(ctxt->get_reference_count() > 0); + assert(hContext->getReferenceCount() > 0); - ctxt->increment_reference_count(); + hContext->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -164,12 +167,11 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urContextCreateWithNativeHandle( const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, ur_context_handle_t *phContext) { - (void)hNativeContext; - (void)phContext; - - // TODO(ur): Needed for the conformance test to pass, but it may be valid - // to have a null CUDA context - UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + std::ignore = hNativeContext; + std::ignore = numDevices; + std::ignore = phDevices; + std::ignore = pProperties; + std::ignore = phContext; return UR_RESULT_ERROR_INVALID_OPERATION; } @@ -180,6 +182,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); - hContext->set_extended_deleter(pfnDeleter, pUserData); + hContext->setExtendedDeleter(pfnDeleter, pUserData); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp index 98b8702213f2e..aa61e1e84b4aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp @@ -11,12 +11,12 @@ #include "device.hpp" #include "platform.hpp" -typedef void (*ur_context_extended_deleter_t)(void *user_data); +typedef void (*ur_context_extended_deleter_t)(void *UserData); /// UR context mapping to a HIP context object. /// -/// There is no direct mapping between a HIP context and a UR context, -/// main differences described below: +/// There is no direct mapping between a HIP context and a UR context. +/// The main differences are described below: /// /// HIP context vs UR context /// @@ -26,22 +26,21 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// with a given device and control access to said device from the user side. /// UR API context are objects that are passed to functions, and not bound /// to threads. 
-/// The ur_context_handle_t_ object doesn't implement this behavior, only holds the -/// HIP context data. The RAII object \ref ScopedContext implements the active -/// context behavior. +/// The ur_context_handle_t_ object doesn't implement this behavior. It only +/// holds the HIP context data. The RAII object \ref ScopedContext implements +/// the active context behavior. /// -/// Primary vs User-defined context +/// Primary vs UserDefined context /// /// HIP has two different types of context, the Primary context, /// which is usable by all threads on a given process for a given device, and /// the aforementioned custom contexts. -/// HIP documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the HIP Runtime API. -/// For UR applications to interop with HIP Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that +/// The HIP documentation, and performance analysis, suggest using the Primary +/// context whenever possible. The Primary context is also used by the HIP +/// Runtime API. For UR applications to interop with HIP Runtime API, they have +/// to use the primary context - and make that active in the thread. The +/// `ur_context_handle_t_` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `UserDefined` context, so that /// the UR object interface is always the same. /// /// Destructor callback @@ -50,59 +49,60 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// the UR Context can store a number of callback functions that will be /// called upon destruction of the UR Context. /// See proposal for details. 
+/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md /// struct ur_context_handle_t_ { struct deleter_data { - ur_context_extended_deleter_t function; - void *user_data; + ur_context_extended_deleter_t Function; + void *UserData; - void operator()() { function(user_data); } + void operator()() { Function(UserData); } }; using native_type = hipCtx_t; - enum class kind { primary, user_defined } kind_; - native_type hipContext_; - ur_device_handle_t deviceId_; - std::atomic_uint32_t refCount_; + enum class kind { Primary, UserDefined } Kind; + native_type HIPContext; + ur_device_handle_t DeviceId; + std::atomic_uint32_t RefCount; - ur_context_handle_t_(kind k, hipCtx_t ctxt, ur_device_handle_t devId) - : kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1} { - deviceId_->set_context(this); - urDeviceRetain(deviceId_); + ur_context_handle_t_(kind K, hipCtx_t Ctxt, ur_device_handle_t DevId) + : Kind{K}, HIPContext{Ctxt}, DeviceId{DevId}, RefCount{1} { + DeviceId->setContext(this); + urDeviceRetain(DeviceId); }; - ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + ~ur_context_handle_t_() { urDeviceRelease(DeviceId); } - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); + void invokeExtendedDeleters() { + std::lock_guard Guard(Mutex); + for (auto &Deleter : ExtendedDeleters) { + Deleter(); } } - void set_extended_deleter(ur_context_extended_deleter_t function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); + void setExtendedDeleter(ur_context_extended_deleter_t Function, + void *UserData) { + std::lock_guard Guard(Mutex); + ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); } - ur_device_handle_t get_device() const noexcept { return deviceId_; } + ur_device_handle_t getDevice() const noexcept { return DeviceId; } - native_type get() const 
noexcept { return hipContext_; } + native_type get() const noexcept { return HIPContext; } - bool is_primary() const noexcept { return kind_ == kind::primary; } + bool isPrimary() const noexcept { return Kind == kind::Primary; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } private: - std::mutex mutex_; - std::vector extended_deleters_; + std::mutex Mutex; + std::vector ExtendedDeleters; }; namespace { @@ -113,24 +113,24 @@ namespace { /// API is the one active on the thread. /// The implementation tries to avoid replacing the hipCtx_t if it cans class ScopedContext { - ur_context_handle_t placedContext_; - hipCtx_t original_; - bool needToRecover_; + ur_context_handle_t PlacedContext; + hipCtx_t Original; + bool NeedToRecover; public: - ScopedContext(ur_context_handle_t ctxt) - : placedContext_{ctxt}, needToRecover_{false} { + ScopedContext(ur_context_handle_t Ctxt) + : PlacedContext{Ctxt}, NeedToRecover{false} { - if (!placedContext_) { + if (!PlacedContext) { throw UR_RESULT_ERROR_INVALID_CONTEXT; } - hipCtx_t desired = placedContext_->get(); - UR_CHECK_ERROR(hipCtxGetCurrent(&original_)); - if (original_ != desired) { + hipCtx_t Desired = PlacedContext->get(); + UR_CHECK_ERROR(hipCtxGetCurrent(&Original)); + if (Original != Desired) { // Sets the desired context as the active one for the thread - UR_CHECK_ERROR(hipCtxSetCurrent(desired)); - if (original_ == nullptr) { + UR_CHECK_ERROR(hipCtxSetCurrent(Desired)); + if (Original == nullptr) { // No context is installed on the current thread // This is the most common case. 
We can activate the context in the // thread and leave it there until all the UR context referring to the @@ -138,14 +138,14 @@ class ScopedContext { // the behaviour of the HIP runtime api, and avoids costly context // switches. No action is required on this side of the if. } else { - needToRecover_ = true; + NeedToRecover = true; } } } ~ScopedContext() { - if (needToRecover_) { - UR_CHECK_ERROR(hipCtxSetCurrent(original_)); + if (NeedToRecover) { + UR_CHECK_ERROR(hipCtxSetCurrent(Original)); } } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index c479644e87bef..3e4aab8f1f0aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -12,113 +12,113 @@ #include -int getAttribute(ur_device_handle_t device, hipDeviceAttribute_t attribute) { - int value; +int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute) { + int Value; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess); - return value; + hipDeviceGetAttribute(&Value, Attribute, Device->get()) == hipSuccess); + return Value; } -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, - ur_device_info_t infoType, +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, size_t propSize, - void *pDeviceInfo, + void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - static constexpr uint32_t max_work_item_dimensions = 3u; + static constexpr uint32_t MaxWorkItemDimensions = 3u; - switch ((uint32_t)infoType) { + switch ((uint32_t)propName) { case UR_DEVICE_INFO_TYPE: { return 
ReturnValue(UR_DEVICE_TYPE_GPU); } case UR_DEVICE_INFO_VENDOR_ID: { #if defined(__HIP_PLATFORM_AMD__) - uint32_t vendor_id = 4098u; + uint32_t VendorId = 4098u; #elif defined(__HIP_PLATFORM_NVIDIA__) - uint32_t vendor_id = 4318u; + uint32_t VendorId = 4318u; #else - uint32_t vendor_id = 0u; + uint32_t VendorId = 0u; #endif - return ReturnValue(vendor_id); + return ReturnValue(VendorId); } case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; + int ComputeUnits = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&compute_units, + hipDeviceGetAttribute(&ComputeUnits, hipDeviceAttributeMultiprocessorCount, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(compute_units >= 0); - return ReturnValue(static_cast(compute_units)); + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); } case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return ReturnValue(max_work_item_dimensions); + return ReturnValue(MaxWorkItemDimensions); } case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { struct { - size_t sizes[max_work_item_dimensions]; + size_t sizes[MaxWorkItemDimensions]; } return_sizes; - int max_x = 0, max_y = 0, max_z = 0; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(max_x >= 0); + hipDeviceGetAttribute(&MaxX, hipDeviceAttributeMaxBlockDimX, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(max_y >= 0); + hipDeviceGetAttribute(&MaxY, hipDeviceAttributeMaxBlockDimY, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ, - device->get()) == 
hipSuccess); - sycl::detail::ur::assertion(max_z >= 0); + hipDeviceGetAttribute(&MaxZ, hipDeviceAttributeMaxBlockDimZ, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); + return_sizes.sizes[0] = size_t(MaxX); + return_sizes.sizes[1] = size_t(MaxY); + return_sizes.sizes[2] = size_t(MaxZ); return ReturnValue(return_sizes); } case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { struct { - size_t sizes[max_work_item_dimensions]; + size_t sizes[MaxWorkItemDimensions]; } return_sizes; - int max_x = 0, max_y = 0, max_z = 0; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(max_x >= 0); + hipDeviceGetAttribute(&MaxX, hipDeviceAttributeMaxGridDimX, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(max_y >= 0); + hipDeviceGetAttribute(&MaxY, hipDeviceAttributeMaxGridDimY, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(max_z >= 0); + hipDeviceGetAttribute(&MaxZ, hipDeviceAttributeMaxGridDimZ, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); + return_sizes.sizes[0] = size_t(MaxX); + return_sizes.sizes[1] = size_t(MaxY); + return_sizes.sizes[2] = size_t(MaxZ); return ReturnValue(return_sizes); } case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; + int MaxWorkGroupSize = 0; 
sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_work_group_size, + hipDeviceGetAttribute(&MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); + hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(max_work_group_size >= 0); + sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); - return ReturnValue(size_t(max_work_group_size)); + return ReturnValue(size_t(MaxWorkGroupSize)); } case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { return ReturnValue(1u); @@ -164,47 +164,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_threads, - hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); - int warpSize = 0; + hipDeviceGetAttribute(&MaxThreads, hipDeviceAttributeMaxThreadsPerBlock, + hDevice->get()) == hipSuccess); + int WarpSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(maxWarps); + hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, - device->get()) == hipSuccess); - bool ifp = (major >= 7); - return ReturnValue(ifp); + hipDeviceGetAttribute(&Major, hipDeviceAttributeComputeCapabilityMajor, + hDevice->get()) == hipSuccess); + bool IFP = (Major >= 7); + return 
ReturnValue(IFP); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - size_t sizes[1] = {static_cast(warpSize)}; - return ReturnValue(sizes, 1); + hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); + size_t Sizes[1] = {static_cast(WarpSize)}; + return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; + int ClockFreq = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(clock_freq >= 0); - return ReturnValue(static_cast(clock_freq) / 1000u); + hipDeviceGetAttribute(&ClockFreq, hipDeviceAttributeClockRate, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { - auto bits = uint32_t{std::numeric_limits::digits}; - return ReturnValue(bits); + auto Bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(Bits); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { // Max size of memory object allocation in bytes. @@ -213,16 +212,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // 32 × 1024 × 1024) for devices that are not of type // CL_DEVICE_TYPE_CUSTOM. 
- size_t global = 0; - sycl::detail::ur::assertion(hipDeviceTotalMem(&global, device->get()) == + size_t Global = 0; + sycl::detail::ur::assertion(hipDeviceTotalMem(&Global, hDevice->get()) == hipSuccess); - auto quarter_global = static_cast(global / 4u); + auto QuarterGlobal = static_cast(Global / 4u); - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); + auto MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); - return ReturnValue(uint64_t{max_alloc}); + return ReturnValue(uint64_t{MaxAlloc}); } case UR_DEVICE_INFO_IMAGE_SUPPORTED: { return ReturnValue(uint32_t{true}); @@ -241,107 +240,105 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hipDeviceGetAttribute(&TexHeight, hipDeviceAttributeMaxTexture2DHeight, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_height >= 0); + hipDeviceGetAttribute(&SurfHeight, hipDeviceAttributeMaxTexture2DHeight, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
- int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture2DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_width >= 0); + hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture2DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. 
- int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hipDeviceGetAttribute(&TexHeight, hipDeviceAttributeMaxTexture3DHeight, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_height >= 0); + hipDeviceGetAttribute(&SurfHeight, hipDeviceAttributeMaxTexture3DHeight, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
- int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture3DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_width >= 0); + hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture3DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { // Take the smaller of maximum surface and maximum texture depth. 
- int tex_depth = 0; + int TexDepth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_depth >= 0); - int surf_depth = 0; + hipDeviceGetAttribute(&TexDepth, hipDeviceAttributeMaxTexture3DDepth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_depth >= 0); + hipDeviceGetAttribute(&SurfDepth, hipDeviceAttributeMaxTexture3DDepth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfDepth >= 0); - int min = std::min(tex_depth, surf_depth); + int Min = std::min(TexDepth, SurfDepth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { // Take the smaller of maximum surface and maximum texture width. 
- int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture1DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(surf_width >= 0); + hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture1DWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { return ReturnValue(0lu); @@ -357,20 +354,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(4000lu); } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; + int MemBaseAddrAlign = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&mem_base_addr_align, + hipDeviceGetAttribute(&MemBaseAddrAlign, hipDeviceAttributeTextureAlignment, - device->get()) == hipSuccess); + hDevice->get()) == hipSuccess); // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return ReturnValue(mem_base_addr_align); + MemBaseAddrAlign *= 8; + return ReturnValue(MemBaseAddrAlign); } case UR_DEVICE_INFO_HALF_FP_CONFIG: { return ReturnValue(0u); } case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - uint64_t config = + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | @@ -378,16 +375,16 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urDeviceGetInfo(ur_device_handle_t device, UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA | UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); @@ -398,35 +395,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(128u); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; + int CacheSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(cache_size >= 0); + hipDeviceGetAttribute(&CacheSize, hipDeviceAttributeL2CacheSize, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(CacheSize >= 0); // The L2 cache is global to the GPU. - return ReturnValue(static_cast(cache_size)); + return ReturnValue(static_cast(CacheSize)); } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; + size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. 
- sycl::detail::ur::assertion(hipDeviceTotalMem(&bytes, device->get()) == + sycl::detail::ur::assertion(hipDeviceTotalMem(&Bytes, hDevice->get()) == hipSuccess); - return ReturnValue(uint64_t{bytes}); + return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; + int ConstantMemory = 0; // hipDeviceGetAttribute takes a int*, however the size of the constant // memory on AMD GPU may be larger than what can fit in the positive part // of a signed integer, so use an unsigned integer and cast the pointer to // int*. sycl::detail::ur::assertion( - hipDeviceGetAttribute(&constant_memory, + hipDeviceGetAttribute(&ConstantMemory, hipDeviceAttributeTotalConstantMemory, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(constant_memory >= 0); + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(ConstantMemory >= 0); - return ReturnValue(static_cast(constant_memory)); + return ReturnValue(static_cast(ConstantMemory)); } case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { // TODO: is there a way to retrieve this from HIP driver API? @@ -441,33 +438,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // OpenCL's "local memory" maps most closely to HIP's "shared memory". // HIP has its own definition of "local memory", which maps to OpenCL's // "private memory". 
- int local_mem_size = 0; + int LocalMemSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&local_mem_size, + hipDeviceGetAttribute(&LocalMemSize, hipDeviceAttributeMaxSharedMemoryPerBlock, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(local_mem_size >= 0); - return ReturnValue(static_cast(local_mem_size)); + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); } case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; + int EccEnabled = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&EccEnabled, hipDeviceAttributeEccEnabled, + hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return ReturnValue(result); + sycl::detail::ur::assertion((EccEnabled == 0) | (EccEnabled == 1)); + auto Result = static_cast(EccEnabled); + return ReturnValue(Result); } case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; + int IsIntegrated = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&IsIntegrated, hipDeviceAttributeIntegrated, + hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); - return ReturnValue(result); + sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto Result = static_cast(IsIntegrated); + return ReturnValue(Result); } case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX @@ -490,21 +487,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(true); } case 
UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = ur_device_exec_capability_flags_t{ + auto Capability = ur_device_exec_capability_flags_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: case UR_DEVICE_INFO_QUEUE_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; - return ReturnValue(capability); + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(Capability); } case UR_DEVICE_INFO_BUILT_IN_KERNELS: { // An empty string is returned if no built-in kernels are supported by the @@ -512,23 +509,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(""); } case UR_DEVICE_INFO_PLATFORM: { - return ReturnValue(device->get_platform()); + return ReturnValue(hDevice->getPlatform()); } case UR_DEVICE_INFO_NAME: { static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::ur::assertion(hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == hipSuccess); + char Name[MAX_DEVICE_NAME_LENGTH]; + sycl::detail::ur::assertion(hipDeviceGetName(Name, MAX_DEVICE_NAME_LENGTH, + hDevice->get()) == hipSuccess); // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch // name instead, this is also what AMD OpenCL devices return. 
- if (strlen(name) == 0) { - hipDeviceProp_t props; + if (strlen(Name) == 0) { + hipDeviceProp_t Props; sycl::detail::ur::assertion( - hipGetDeviceProperties(&props, device->get()) == hipSuccess); + hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); - return ReturnValue(props.gcnArchName, strlen(props.gcnArchName) + 1); + return ReturnValue(Props.gcnArchName, strlen(Props.gcnArchName) + 1); } - return ReturnValue(name, strlen(name) + 1); + return ReturnValue(Name, strlen(Name) + 1); } case UR_DEVICE_INFO_VENDOR: { return ReturnValue("AMD Corporation"); @@ -541,22 +538,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue("HIP"); } case UR_DEVICE_INFO_REFERENCE_COUNT: { - return ReturnValue(device->get_reference_count()); + return ReturnValue(hDevice->getReferenceCount()); } case UR_DEVICE_INFO_VERSION: { - std::stringstream s; + std::stringstream S; - hipDeviceProp_t props; - sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); + hipDeviceProp_t Props; + sycl::detail::ur::assertion( + hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); #if defined(__HIP_PLATFORM_NVIDIA__) - s << props.major << "." << props.minor; + S << Props.major << "." 
<< Props.minor; #elif defined(__HIP_PLATFORM_AMD__) - s << props.gcnArchName; + S << Props.gcnArchName; #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif - return ReturnValue(s.str().c_str()); + return ReturnValue(S.str().c_str()); } case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { return ReturnValue(""); @@ -570,11 +567,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, SupportedExtensions += "pi_ext_intel_devicelib_assert "; SupportedExtensions += " "; - hipDeviceProp_t props; - sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); + hipDeviceProp_t Props; + sycl::detail::ur::assertion( + hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); - if (props.arch.hasDoubles) { + if (Props.arch.hasDoubles) { SupportedExtensions += "cl_khr_fp64 "; } @@ -610,13 +607,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint64_t value = {}; + uint64_t Value = {}; // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + if (getAttribute(hDevice, hipDeviceAttributeComputeCapabilityMajor) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; @@ -624,10 +621,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // on GPU architectures with compute capability lower 
than 6.x, atomic // operations from the GPU to CPU memory will not be atomic with respect // to CPU initiated atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -635,12 +632,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint64_t value = + uint64_t Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -648,23 +645,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // allocation associated with this device." 
// // query if/how the device can access managed memory associated to it - uint64_t value = {}; - if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + uint64_t Value = {}; + if (getAttribute(hDevice, hipDeviceAttributeManagedMemory)) { // the device can allocate managed memory on this system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; } - if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { + if (getAttribute(hDevice, hipDeviceAttributeConcurrentManagedAccess)) { // the device can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(hDevice, hipDeviceAttributeComputeCapabilityMajor) >= + 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -675,26 +673,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access managed memory associated to other // devices - uint64_t value = {}; - if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + uint64_t Value = {}; + if (getAttribute(hDevice, hipDeviceAttributeManagedMemory)) { // the device can allocate managed memory on this system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; } - if (getAttribute(device, 
hipDeviceAttributeConcurrentManagedAccess)) { + if (getAttribute(hDevice, hipDeviceAttributeConcurrentManagedAccess)) { // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS // attribute can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + if (getAttribute(hDevice, hipDeviceAttributeComputeCapabilityMajor) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -703,32 +701,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint64_t value = {}; - if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) { + uint64_t Value = {}; + if (getAttribute(hDevice, hipDeviceAttributePageableMemoryAccess)) { // the link between the device and the host does not support native // atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | 
UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { - int major = 0, minor = 0; + int Major = 0, Minor = 0; sycl::detail::ur::assertion( - hipDeviceComputeCapability(&major, &minor, device->get()) == + hipDeviceComputeCapability(&Major, &Minor, hDevice->get()) == hipSuccess); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return ReturnValue(result.c_str()); + std::string Result = std::to_string(Major) + "." + std::to_string(Minor); + return ReturnValue(Result.c_str()); } case UR_DEVICE_INFO_ATOMIC_64: { // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS. - hipDeviceProp_t props; - sycl::detail::ur::assertion(hipGetDeviceProperties(&props, device->get()) == - hipSuccess); - return ReturnValue(props.arch.hasGlobalInt64Atomics && - props.arch.hasSharedInt64Atomics); + hipDeviceProp_t Props; + sycl::detail::ur::assertion( + hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); + return ReturnValue(Props.arch.hasGlobalInt64Atomics && + Props.arch.hasSharedInt64Atomics); } case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { @@ -741,32 +739,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(value >= 0); + hipDeviceGetAttribute(&Value, hipDeviceAttributeMemoryClockRate, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(Value >= 0); // Convert kilohertz to megahertz when returning. 
- return ReturnValue(value / 1000); + return ReturnValue(Value / 1000); } case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + hipDeviceGetAttribute(&Value, hipDeviceAttributeMemoryBusWidth, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { return ReturnValue(int32_t{1}); } case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { @@ -775,39 +773,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // Because scopes are hierarchical, wider scopes support all narrower // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - uint64_t capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + uint64_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for // atomic_fence_order_capabilities. 
- ur_memory_order_capability_flags_t capabilities = + ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_DEVICE_ID: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId, - device->get()) == hipSuccess); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + hipDeviceGetAttribute(&Value, hipDeviceAttributePciDeviceId, + hDevice->get()) == hipSuccess); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { #if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \ HIP_VERSION_MAJOR > 5) - hipUUID uuid = {}; + hipUUID UUID = {}; // Supported since 5.2+ - sycl::detail::ur::assertion(hipDeviceGetUuid(&uuid, device->get()) == + sycl::detail::ur::assertion(hipDeviceGetUuid(&UUID, hDevice->get()) == hipSuccess); - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return ReturnValue(name.data(), 16); + std::array Name; + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); + return ReturnValue(Name.data(), 16); #endif return UR_RESULT_ERROR_INVALID_VALUE; } @@ -815,13 +813,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident // on a multiprocessor. 
- int max_registers{-1}; + int MaxRegisters{-1}; UR_CHECK_ERROR(hipDeviceGetAttribute( - &max_registers, hipDeviceAttributeMaxRegistersPerBlock, device->get())); + &MaxRegisters, hipDeviceAttributeMaxRegistersPerBlock, hDevice->get())); - sycl::detail::ur::assertion(max_registers >= 0); + sycl::detail::ur::assertion(MaxRegisters >= 0); - return ReturnValue(static_cast(max_registers)); + return ReturnValue(static_cast(MaxRegisters)); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: return ReturnValue(false); @@ -848,8 +846,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, /// \return UR_RESULT_SUCCESS if the function is executed successfully /// HIP devices are always root devices so retain always returns success. -UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -862,8 +860,9 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since HIP devices are always root /// devices. 
-UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_DLLEXPORT ur_result_t UR_APICALL +urDeviceRelease(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -873,32 +872,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { - ur_result_t err = UR_RESULT_SUCCESS; - const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; - const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; - const bool returnDevices = askingForDefault || askingForGPU || askingForAll; + ur_result_t Err = UR_RESULT_SUCCESS; + const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool ReturnDevices = AskingForDefault || AskingForGPU || AskingForAll; UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + size_t NumDevices = ReturnDevices ? hPlatform->Devices.size() : 0; try { UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); if (pNumDevices) { - *pNumDevices = numDevices; + *pNumDevices = NumDevices; } - if (returnDevices && phDevices) { - for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { - phDevices[i] = hPlatform->devices_[i].get(); + if (ReturnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) { + phDevices[i] = hPlatform->Devices[i].get(); } } - return err; - } catch (ur_result_t err) { - return err; + return Err; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -906,11 +905,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, /// Gets the native HIP handle of a UR device object /// -/// \param[in] device The UR device to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the UR device object. +/// \param[in] hDevice The UR device to get the native HIP object of. +/// \param[out] phNativeHandle Set to the native handle of the UR device object. /// /// \return UR_RESULT_SUCCESS - UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -924,19 +922,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { + std::ignore = hPlatform; + std::ignore = pProperties; + UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); return UR_RESULT_ERROR_INVALID_OPERATION; } -/// \return If available, the first binary that is PTX +/// \return UR_RESULT_SUCCESS If available, the first binary that is PTX /// UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, uint32_t NumBinaries, uint32_t *pSelectedBinary) { // Ignore unused parameter - (void)hDevice; + std::ignore = hDevice; UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); @@ -944,14 +945,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Look for an image for the HIP target, and return the first one that is // found #if defined(__HIP_PLATFORM_AMD__) - const char *binary_type = UR_DEVICE_BINARY_TARGET_AMDGCN; + const char *BinaryType = 
UR_DEVICE_BINARY_TARGET_AMDGCN; #elif defined(__HIP_PLATFORM_NVIDIA__) - const char *binary_type = UR_DEVICE_BINARY_TARGET_NVPTX64; + const char *BinaryType = UR_DEVICE_BINARY_TARGET_NVPTX64; #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif for (uint32_t i = 0; i < NumBinaries; i++) { - if (strcmp(pBinaries[i].pDeviceTargetSpec, binary_type) == 0) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, BinaryType) == 0) { *pSelectedBinary = i; return UR_RESULT_SUCCESS; } @@ -969,12 +970,12 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, if (!pDeviceTimestamp && !pHostTimestamp) return UR_RESULT_SUCCESS; - ur_event_handle_t_::native_type event; - ScopedContext active(hDevice->get_context()); + ur_event_handle_t_::native_type Event; + ScopedContext Active(hDevice->getContext()); if (pDeviceTimestamp) { - UR_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault)); - UR_CHECK_ERROR(hipEventRecord(event)); + UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault)); + UR_CHECK_ERROR(hipEventRecord(Event)); } if (pHostTimestamp) { using namespace std::chrono; @@ -984,11 +985,11 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, } if (pDeviceTimestamp) { - UR_CHECK_ERROR(hipEventSynchronize(event)); - float elapsedTime = 0.0f; - UR_CHECK_ERROR(hipEventElapsedTime(&elapsedTime, - ur_platform_handle_t_::evBase_, event)); - *pDeviceTimestamp = (uint64_t)(elapsedTime * (double)1e6); + UR_CHECK_ERROR(hipEventSynchronize(Event)); + float ElapsedTime = 0.0f; + UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime, + ur_platform_handle_t_::EvBase, Event)); + *pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6); } return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp index 578e003223d4c..370aaee5424b2 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp @@ -20,24 +20,24 @@ struct ur_device_handle_t_ { private: using native_type = hipDevice_t; - native_type hipDevice_; - std::atomic_uint32_t refCount_; - ur_platform_handle_t platform_; - ur_context_handle_t context_; + native_type HIPDevice; + std::atomic_uint32_t RefCount; + ur_platform_handle_t Platform; + ur_context_handle_t Context; public: - ur_device_handle_t_(native_type hipDevice, ur_platform_handle_t platform) - : hipDevice_(hipDevice), refCount_{1}, platform_(platform) {} + ur_device_handle_t_(native_type HipDevice, ur_platform_handle_t Platform) + : HIPDevice(HipDevice), RefCount{1}, Platform(Platform) {} - native_type get() const noexcept { return hipDevice_; }; + native_type get() const noexcept { return HIPDevice; }; - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - ur_platform_handle_t get_platform() const noexcept { return platform_; }; + ur_platform_handle_t getPlatform() const noexcept { return Platform; }; - void set_context(ur_context_handle_t ctx) { context_ = ctx; }; + void setContext(ur_context_handle_t Ctxt) { Context = Ctxt; }; - ur_context_handle_t get_context() { return context_; }; + ur_context_handle_t getContext() { return Context; }; }; -int getAttribute(ur_device_handle_t device, hipDeviceAttribute_t attribute); +int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index a2be12c20c9ab..6cf7e169d364b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -15,8 +15,8 @@ namespace { -static size_t imageElementByteSize(hipArray_Format array_format) { - switch (array_format) { +static size_t 
imageElementByteSize(hipArray_Format ArrayFormat) { + switch (ArrayFormat) { case HIP_AD_FORMAT_UNSIGNED_INT8: case HIP_AD_FORMAT_SIGNED_INT8: return 1; @@ -34,58 +34,57 @@ static size_t imageElementByteSize(hipArray_Format array_format) { return 0; } -ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, - hipStream_t stream, - uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list) { - if (!event_wait_list) { +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, + hipStream_t Stream, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + if (!EventWaitList) { return UR_RESULT_SUCCESS; } try { - ScopedContext active(command_queue->get_context()); + ScopedContext Active(CommandQueue->getContext()); - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](ur_event_handle_t event) -> ur_result_t { - if (event->get_stream() == stream) { + auto Result = forLatestEvents( + EventWaitList, NumEventsInWaitList, + [Stream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getStream() == Stream) { return UR_RESULT_SUCCESS; } else { - return UR_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0)); + return UR_CHECK_ERROR(hipStreamWaitEvent(Stream, Event->get(), 0)); } }); - if (result != UR_RESULT_SUCCESS) { - return result; + if (Result != UR_RESULT_SUCCESS) { + return Result; } return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } } -void simpleGuessLocalWorkSize(size_t *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], - ur_kernel_handle_t kernel) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); +void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel) { + assert(ThreadsPerBlock != nullptr); + assert(GlobalWorkSize != nullptr); + assert(Kernel != nullptr); // int recommendedBlockSize, minGrid; // UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( - // &minGrid, &recommendedBlockSize, kernel->get(), + // &minGrid, &recommendedBlockSize, Kernel->get(), // 0, 0)); //(void)minGrid; // Not used, avoid warnings - threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]); + ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]); // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. 
- while (0u != (global_work_size[0] % threadsPerBlock[0])) { - --threadsPerBlock[0]; + while (0u != (GlobalWorkSize[0] % ThreadsPerBlock[0])) { + --ThreadsPerBlock[0]; } } } // namespace @@ -103,41 +102,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - ur_result_t retErr = UR_RESULT_SUCCESS; - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_WRITE, hQueue, hipStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, HIPStream)); + RetImplEvent->start(); } - retErr = UR_CHECK_ERROR( - hipMemcpyHtoDAsync(hBuffer->mem_.buffer_mem_.get_with_offset(offset), - const_cast(pSrc), size, hipStream)); + Result = UR_CHECK_ERROR( + hipMemcpyHtoDAsync(hBuffer->Mem.BufferMem.getWithOffset(offset), + const_cast(pSrc), size, HIPStream)); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingWrite) { - retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( @@ 
-153,42 +152,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - ur_result_t retErr = UR_RESULT_SUCCESS; - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ, hQueue, hipStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ, hQueue, HIPStream)); + RetImplEvent->start(); } - retErr = UR_CHECK_ERROR(hipMemcpyDtoHAsync( - pDst, hBuffer->mem_.buffer_mem_.get_with_offset(offset), size, - hipStream)); + Result = UR_CHECK_ERROR(hipMemcpyDtoHAsync( + pDst, hBuffer->Mem.BufferMem.getWithOffset(offset), size, HIPStream)); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - retErr = err; + Result = err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( @@ -201,7 +199,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pGlobalWorkSize, 
UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), UR_RESULT_ERROR_INVALID_QUEUE); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); @@ -213,28 +211,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); { - ur_result_t retError = urDeviceGetInfo( - hQueue->device_, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + ur_result_t Result = urDeviceGetInfo( + hQueue->Device, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - retError = - urDeviceGetInfo(hQueue->device_, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + Result = + urDeviceGetInfo(hQueue->Device, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - // The maxWorkGroupsSize = 1024 for AMD GPU - // The maxThreadsPerBlock = {1024, 1024, 1024} + // The MaxWorkGroupSize = 1024 for AMD GPU + // The MaxThreadsPerBlock = {1024, 1024, 1024} - if (providedLocalWorkGroupSize) { + if (ProvidedLocalWorkGroupSize) { auto isValid = [&](int dim) { - 
UR_ASSERT(pLocalWorkSize[dim] <= maxThreadsPerBlock[dim], + UR_ASSERT(pLocalWorkSize[dim] <= MaxThreadsPerBlock[dim], UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); // Checks that local work sizes are a divisor of the global work sizes // which includes that the local work sizes are neither larger than the @@ -242,7 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(pLocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT((pGlobalWorkSize[dim] % pLocalWorkSize[dim]) == 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - threadsPerBlock[dim] = pLocalWorkSize[dim]; + ThreadsPerBlock[dim] = pLocalWorkSize[dim]; return UR_RESULT_SUCCESS; }; @@ -252,60 +250,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return err; } } else { - simpleGuessLocalWorkSize(threadsPerBlock, pGlobalWorkSize, - maxThreadsPerBlock, hKernel); + simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize, + MaxThreadsPerBlock, hKernel); } } - UR_ASSERT(maxWorkGroupSize >= size_t(threadsPerBlock[0] * threadsPerBlock[1] * - threadsPerBlock[2]), + UR_ASSERT(MaxWorkGroupSize >= size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] * + ThreadsPerBlock[2]), UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - size_t blocksPerGrid[3] = {1u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; for (size_t i = 0; i < workDim; i++) { - blocksPerGrid[i] = - (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + BlocksPerGrid[i] = + (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; } - ur_result_t retError = UR_RESULT_SUCCESS; - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - uint32_t stream_token; - ur_stream_quard guard; - hipStream_t hipStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - hipFunction_t hipFunc = 
hKernel->get(); + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + hipFunction_t HIPFunc = hKernel->get(); - retError = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, - phEventWaitList); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList); // Set the implicit global offset parameter if kernel has offset variant - if (hKernel->get_with_offset_parameter()) { + if (hKernel->getWithOffsetParameter()) { std::uint32_t hip_implicit_offset[3] = {0, 0, 0}; if (pGlobalWorkOffset) { for (size_t i = 0; i < workDim; i++) { hip_implicit_offset[i] = static_cast(pGlobalWorkOffset[i]); if (pGlobalWorkOffset[i] != 0) { - hipFunc = hKernel->get_with_offset_parameter(); + HIPFunc = hKernel->getWithOffsetParameter(); } } } - hKernel->set_implicit_offset_arg(sizeof(hip_implicit_offset), - hip_implicit_offset); + hKernel->setImplicitOffsetArg(sizeof(hip_implicit_offset), + hip_implicit_offset); } - auto argIndices = hKernel->get_arg_indices(); + auto ArgIndices = hKernel->getArgIndices(); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_KERNEL_LAUNCH, hQueue, hipStream, stream_token)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, HIPStream, StreamToken)); + RetImplEvent->start(); } // Set local mem max size if env var is present @@ -314,9 +312,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (local_mem_sz_ptr) { int device_max_local_mem = 0; - retError = UR_CHECK_ERROR(hipDeviceGetAttribute( + Result = UR_CHECK_ERROR(hipDeviceGetAttribute( &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock, - hQueue->get_device()->get())); + hQueue->getDevice()->get())); static const int env_val = std::atoi(local_mem_sz_ptr); if (env_val <= 0 || env_val > device_max_local_mem) { @@ -325,25 
+323,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - retError = UR_CHECK_ERROR(hipFuncSetAttribute( - hipFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val)); + Result = UR_CHECK_ERROR(hipFuncSetAttribute( + HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val)); } - retError = UR_CHECK_ERROR(hipModuleLaunchKernel( - hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], - hKernel->get_local_size(), hipStream, argIndices.data(), nullptr)); + Result = UR_CHECK_ERROR(hipModuleLaunchKernel( + HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], + ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], + hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); - hKernel->clear_local_size(); + hKernel->clearLocalSize(); if (phEvent) { - retError = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - retError = err; + Result = err; } - return retError; + return Result; } /// Enqueues a wait on the given queue for all events. 
@@ -374,72 +372,69 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST) - ur_result_t result; + ur_result_t Result; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_quard guard; - hipStream_t hipStream = hQueue->get_next_compute_stream( + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, - reinterpret_cast(phEventWaitList), guard, - &stream_token); + reinterpret_cast(phEventWaitList), Guard, + &StreamToken); { - std::lock_guard guard(hQueue->barrier_mutex_); - if (hQueue->barrier_event_ == nullptr) { - UR_CHECK_ERROR(hipEventCreate(&hQueue->barrier_event_)); + std::lock_guard Guard(hQueue->BarrierMutex); + if (hQueue->BarrierEvent == nullptr) { + UR_CHECK_ERROR(hipEventCreate(&hQueue->BarrierEvent)); } if (numEventsInWaitList == 0) { // wait on all work - if (hQueue->barrier_tmp_event_ == nullptr) { - UR_CHECK_ERROR(hipEventCreate(&hQueue->barrier_tmp_event_)); + if (hQueue->BarrierTmpEvent == nullptr) { + UR_CHECK_ERROR(hipEventCreate(&hQueue->BarrierTmpEvent)); } - hQueue->sync_streams( - [hipStream, tmp_event = hQueue->barrier_tmp_event_](hipStream_t s) { - if (hipStream != s) { - UR_CHECK_ERROR(hipEventRecord(tmp_event, s)); - UR_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0)); + hQueue->syncStreams( + [HIPStream, TmpEvent = hQueue->BarrierTmpEvent](hipStream_t S) { + if (HIPStream != S) { + UR_CHECK_ERROR(hipEventRecord(TmpEvent, S)); + UR_CHECK_ERROR(hipStreamWaitEvent(HIPStream, TmpEvent, 0)); } }); } else { // wait just on given events forLatestEvents( reinterpret_cast(phEventWaitList), numEventsInWaitList, - [hipStream](ur_event_handle_t event) -> ur_result_t { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { + 
[HIPStream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getQueue()->hasBeenSynchronized( + Event->getComputeStreamToken())) { return UR_RESULT_SUCCESS; } else { return UR_CHECK_ERROR( - hipStreamWaitEvent(hipStream, event->get(), 0)); + hipStreamWaitEvent(HIPStream, Event->get(), 0)); } }); } - result = - UR_CHECK_ERROR(hipEventRecord(hQueue->barrier_event_, hipStream)); - for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); - i++) { - hQueue->compute_applied_barrier_[i] = false; + Result = UR_CHECK_ERROR(hipEventRecord(hQueue->BarrierEvent, HIPStream)); + for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { + hQueue->ComputeAppliedBarrier[i] = false; } - for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); - i++) { - hQueue->transfer_applied_barrier_[i] = false; + for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { + hQueue->TransferAppliedBarrier[i] = false; } } - if (result != UR_RESULT_SUCCESS) { - return result; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, hipStream, stream_token); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, HIPStream, StreamToken); (*phEvent)->start(); (*phEvent)->record(); } return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -448,54 +443,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( /// General 3D memory copy operation. 
/// This function requires the corresponding HIP context to be at the top of /// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a hipDevPtr static ur_result_t commonEnqueueMemBufferCopyRect( - hipStream_t hip_stream, ur_rect_region_t region, const void *src_ptr, - const hipMemoryType src_type, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const hipMemoryType dst_type, ur_rect_offset_t dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost); - assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost); - - src_row_pitch = (!src_row_pitch) ? region.width : src_row_pitch; - src_slice_pitch = - (!src_slice_pitch) ? (region.height * src_row_pitch) : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? region.width : dst_row_pitch; - dst_slice_pitch = - (!dst_slice_pitch) ? (region.height * dst_row_pitch) : dst_slice_pitch; - - HIP_MEMCPY3D params; - - params.WidthInBytes = region.width; - params.Height = region.height; - params.Depth = region.depth; - - params.srcMemoryType = src_type; - params.srcDevice = src_type == hipMemoryTypeDevice - ? *static_cast(src_ptr) + hipStream_t HipStream, ur_rect_region_t Region, const void *SrcPtr, + const hipMemoryType SrcType, ur_rect_offset_t SrcOffset, size_t SrcRowPitch, + size_t SrcSlicePitch, void *DstPtr, const hipMemoryType DstType, + ur_rect_offset_t DstOffset, size_t DstRowPitch, size_t DstSlicePitch) { + + assert(SrcType == hipMemoryTypeDevice || SrcType == hipMemoryTypeHost); + assert(DstType == hipMemoryTypeDevice || DstType == hipMemoryTypeHost); + + SrcRowPitch = (!SrcRowPitch) ? Region.width : SrcRowPitch; + SrcSlicePitch = + (!SrcSlicePitch) ? (Region.height * SrcRowPitch) : SrcSlicePitch; + DstRowPitch = (!DstRowPitch) ? 
Region.width : DstRowPitch; + DstSlicePitch = + (!DstSlicePitch) ? (Region.height * DstRowPitch) : DstSlicePitch; + + HIP_MEMCPY3D Params; + + Params.WidthInBytes = Region.width; + Params.Height = Region.height; + Params.Depth = Region.depth; + + Params.srcMemoryType = SrcType; + Params.srcDevice = SrcType == hipMemoryTypeDevice + ? *static_cast(SrcPtr) : 0; - params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr; - params.srcXInBytes = src_offset.x; - params.srcY = src_offset.y; - params.srcZ = src_offset.z; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == hipMemoryTypeDevice - ? *reinterpret_cast(dst_ptr) + Params.srcHost = SrcType == hipMemoryTypeHost ? SrcPtr : nullptr; + Params.srcXInBytes = SrcOffset.x; + Params.srcY = SrcOffset.y; + Params.srcZ = SrcOffset.z; + Params.srcPitch = SrcRowPitch; + Params.srcHeight = SrcSlicePitch / SrcRowPitch; + + Params.dstMemoryType = DstType; + Params.dstDevice = DstType == hipMemoryTypeDevice + ? *reinterpret_cast(DstPtr) : 0; - params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr; - params.dstXInBytes = dst_offset.x; - params.dstY = dst_offset.y; - params.dstZ = dst_offset.z; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream)); + Params.dstHost = DstType == hipMemoryTypeHost ? 
DstPtr : nullptr; + Params.dstXInBytes = DstOffset.x; + Params.dstY = DstOffset.y; + Params.dstZ = DstOffset.z; + Params.dstPitch = DstRowPitch; + Params.dstHeight = DstSlicePitch / DstRowPitch; + + return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&Params, HipStream)); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( @@ -530,45 +524,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( UR_ASSERT(!(hostSlicePitch != 0 && hostSlicePitch % hostRowPitch != 0), UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; - void *devPtr = hBuffer->mem_.buffer_mem_.get_void(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + void *DevPtr = hBuffer->Mem.BufferMem.getVoid(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, hipStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, HIPStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, &devPtr, hipMemoryTypeDevice, bufferOrigin, + Result = commonEnqueueMemBufferCopyRect( + HIPStream, region, &DevPtr, hipMemoryTypeDevice, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, hipMemoryTypeHost, hostOrigin, hostRowPitch, hostSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = 
UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -582,44 +576,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - void *devPtr = hBuffer->mem_.buffer_mem_.get_void(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + void *DevPtr = hBuffer->Mem.BufferMem.getVoid(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, hipStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, HIPStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, - hostSlicePitch, &devPtr, hipMemoryTypeDevice, bufferOrigin, + Result = commonEnqueueMemBufferCopyRect( + HIPStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, + hostSlicePitch, &DevPtr, hipMemoryTypeDevice, bufferOrigin, bufferRowPitch, bufferSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingWrite) { - retErr = 
UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -629,38 +623,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - ur_result_t result; - auto stream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + ur_result_t Result; + auto Stream = hQueue->getNextTransferStream(); if (phEventWaitList) { - result = enqueueEventsWait(hQueue, stream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); } if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); + Result = RetImplEvent->start(); } - auto src = hBufferSrc->mem_.buffer_mem_.get_with_offset(srcOffset); - auto dst = hBufferDst->mem_.buffer_mem_.get_with_offset(dstOffset); + auto Src = hBufferSrc->Mem.BufferMem.getWithOffset(srcOffset); + auto Dst = hBufferDst->Mem.BufferMem.getWithOffset(dstOffset); - result = UR_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); + Result = UR_CHECK_ERROR(hipMemcpyDtoDAsync(Dst, Src, size, Stream)); if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + 
return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -678,38 +672,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - void *srcPtr = hBufferSrc->mem_.buffer_mem_.get_void(); - void *dstPtr = hBufferDst->mem_.buffer_mem_.get_void(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + void *SrcPtr = hBufferSrc->Mem.BufferMem.getVoid(); + void *DstPtr = hBufferDst->Mem.BufferMem.getVoid(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, hipStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, HIPStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - hipStream, region, &srcPtr, hipMemoryTypeDevice, srcOrigin, srcRowPitch, - srcSlicePitch, &dstPtr, hipMemoryTypeDevice, dstOrigin, dstRowPitch, + Result = commonEnqueueMemBufferCopyRect( + HIPStream, region, &SrcPtr, hipMemoryTypeDevice, srcOrigin, srcRowPitch, + srcSlicePitch, &DstPtr, hipMemoryTypeDevice, dstOrigin, dstRowPitch, dstSlicePitch); if (phEvent) { - retImplEv->record(); - *phEvent = retImplEv.release(); + RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + 
Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( @@ -722,59 +716,59 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); - auto args_are_multiples_of_pattern_size = + auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); - auto pattern_is_valid = (pPattern != nullptr); + auto PatternIsValid = (pPattern != nullptr); - auto pattern_size_is_valid = + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && // is power of two (patternSize > 0) && (patternSize <= 128); // falls within valid range - UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid, + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, UR_RESULT_ERROR_INVALID_VALUE); - (void)args_are_multiples_of_pattern_size; - (void)pattern_is_valid; - (void)pattern_size_is_valid; + std::ignore = ArgsAreMultiplesOfPatternSize; + std::ignore = PatternIsValid; + std::ignore = PatternSizeIsValid; - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - auto stream = hQueue->get_next_transfer_stream(); - ur_result_t result; + auto Stream = hQueue->getNextTransferStream(); + ur_result_t Result; if (phEventWaitList) { - result = enqueueEventsWait(hQueue, stream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); } if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); + Result = RetImplEvent->start(); } - auto dstDevice = 
hBuffer->mem_.buffer_mem_.get_with_offset(offset); + auto DstDevice = hBuffer->Mem.BufferMem.getWithOffset(offset); auto N = size / patternSize; // pattern size in bytes switch (patternSize) { case 1: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(hipMemsetD8Async(DstDevice, Value, N, Stream)); break; } case 2: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(hipMemsetD16Async(DstDevice, Value, N, Stream)); break; } case 4: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(hipMemsetD32Async(DstDevice, Value, N, Stream)); break; } @@ -790,39 +784,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // Calculate the number of patterns, stride, number of times the pattern // needs to be applied, and the number of times the first 32 bit pattern // needs to be applied. 
- auto number_of_steps = patternSize / sizeof(uint8_t); - auto pitch = number_of_steps * sizeof(uint8_t); - auto height = size / number_of_steps; - auto count_32 = size / sizeof(uint32_t); + auto NumberOfSteps = patternSize / sizeof(uint8_t); + auto Pitch = NumberOfSteps * sizeof(uint8_t); + auto Height = size / NumberOfSteps; + auto Count32 = size / sizeof(uint32_t); // Get 4-byte chunk of the pattern and call hipMemsetD32Async - auto value = *(static_cast(pPattern)); - result = - UR_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream)); - for (auto step = 4u; step < number_of_steps; ++step) { + auto Value = *(static_cast(pPattern)); + Result = + UR_CHECK_ERROR(hipMemsetD32Async(DstDevice, Value, Count32, Stream)); + for (auto step = 4u; step < NumberOfSteps; ++step) { // take 1 byte of the pattern - value = *(static_cast(pPattern) + step); + Value = *(static_cast(pPattern) + step); // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = reinterpret_cast( - reinterpret_cast(dstDevice) + (step * sizeof(uint8_t))); + auto OffsetPtr = reinterpret_cast( + reinterpret_cast(DstDevice) + (step * sizeof(uint8_t))); // set all of the pattern chunks - result = UR_CHECK_ERROR(hipMemset2DAsync( - offset_ptr, pitch, value, sizeof(uint8_t), height, stream)); + Result = UR_CHECK_ERROR(hipMemset2DAsync( + OffsetPtr, Pitch, Value, sizeof(uint8_t), Height, Stream)); } break; } } if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -831,72 +825,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( /// General ND memory copy operation for images (where N > 1). 
/// This function requires the corresponding HIP context to be at the top of /// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr +/// If the source and/or destination is an array, SrcPtr and/or DstPtr /// must be a pointer to a hipArray static ur_result_t commonEnqueueMemImageNDCopy( - hipStream_t hip_stream, ur_mem_type_t img_type, const size_t *region, - const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset, - void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) { - UR_ASSERT(region, UR_RESULT_ERROR_INVALID_NULL_POINTER); + hipStream_t HipStream, ur_mem_type_t ImgType, const size_t *Region, + const void *SrcPtr, const hipMemoryType SrcType, const size_t *SrcOffset, + void *DstPtr, const hipMemoryType DstType, const size_t *DstOffset) { + UR_ASSERT(Region, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost, + UR_ASSERT(SrcType == hipMemoryTypeArray || SrcType == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost, + UR_ASSERT(DstType == hipMemoryTypeArray || DstType == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_VALUE); - if (img_type == UR_MEM_TYPE_IMAGE2D) { - hip_Memcpy2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = - reinterpret_cast(const_cast(src_ptr)); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; + if (ImgType == UR_MEM_TYPE_IMAGE2D) { + hip_Memcpy2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == hipMemoryTypeArray) { + CpyDesc.srcArray = + reinterpret_cast(const_cast(SrcPtr)); + CpyDesc.srcXInBytes = SrcOffset[0]; + CpyDesc.srcY = SrcOffset[1]; } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == hipMemoryTypeArray) { 
- cpyDesc.dstArray = - reinterpret_cast(const_cast(dst_ptr)); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == hipMemoryTypeArray) { + CpyDesc.dstArray = + reinterpret_cast(const_cast(DstPtr)); + CpyDesc.dstXInBytes = DstOffset[0]; + CpyDesc.dstY = DstOffset[1]; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - return UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream)); + CpyDesc.WidthInBytes = Region[0]; + CpyDesc.Height = Region[1]; + return UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc, HipStream)); } - if (img_type == UR_MEM_TYPE_IMAGE3D) { - - HIP_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = - reinterpret_cast(const_cast(src_ptr)); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - cpyDesc.srcZ = src_offset[2]; + if (ImgType == UR_MEM_TYPE_IMAGE3D) { + + HIP_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == hipMemoryTypeArray) { + CpyDesc.srcArray = + reinterpret_cast(const_cast(SrcPtr)); + CpyDesc.srcXInBytes = SrcOffset[0]; + CpyDesc.srcY = SrcOffset[1]; + CpyDesc.srcZ = SrcOffset[2]; } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == hipMemoryTypeArray) { - cpyDesc.dstArray = reinterpret_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - cpyDesc.dstZ = dst_offset[2]; + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == hipMemoryTypeArray) { + CpyDesc.dstArray = reinterpret_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset[0]; + CpyDesc.dstY = DstOffset[1]; + CpyDesc.dstZ = DstOffset[2]; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - 
cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - cpyDesc.Depth = region[2]; - return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream)); + CpyDesc.WidthInBytes = Region[0]; + CpyDesc.Height = Region[1]; + CpyDesc.Depth = Region[2]; + return UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&CpyDesc, HipStream)); return UR_RESULT_ERROR_UNKNOWN; } @@ -913,61 +907,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); } - hipArray *array = hImage->mem_.surface_mem_.get_array(); + hipArray *Array = hImage->Mem.SurfaceMem.getArray(); hipArray_Format Format; size_t NumChannels; - getArrayDesc(array, Format, NumChannels); + getArrayDesc(Array, Format, NumChannels); - int elementByteSize = imageElementByteSize(Format); + int ElementByteSize = imageElementByteSize(Format); - size_t byteOffsetX = origin.x * elementByteSize * NumChannels; - size_t bytesToCopy = elementByteSize * NumChannels * region.depth; + size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; + size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; - auto imgType = hImage->mem_.surface_mem_.get_image_type(); + auto ImgType = hImage->Mem.SurfaceMem.getImageType(); 
- size_t adjustedRegion[3] = {bytesToCopy, region.height, region.height}; - size_t srcOffset[3] = {byteOffsetX, origin.y, origin.z}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height}; + size_t SrcOffset[3] = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, - array, hipMemoryTypeArray, srcOffset, + Result = commonEnqueueMemImageNDCopy(HIPStream, ImgType, AdjustedRegion, + Array, hipMemoryTypeArray, SrcOffset, pDst, hipMemoryTypeHost, nullptr); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_READ, hQueue, hipStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_READ, + hQueue, HIPStream); + NewEvent->record(); + *phEvent = NewEvent; } if (blockingRead) { - retErr = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } return UR_RESULT_SUCCESS; - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( @@ -981,59 +975,59 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); } - hipArray *array = hImage->mem_.surface_mem_.get_array(); + hipArray *Array = hImage->Mem.SurfaceMem.getArray(); hipArray_Format Format; size_t NumChannels; - getArrayDesc(array, Format, NumChannels); + getArrayDesc(Array, Format, NumChannels); - int elementByteSize = imageElementByteSize(Format); + int ElementByteSize = imageElementByteSize(Format); - size_t byteOffsetX = origin.x * elementByteSize * NumChannels; - size_t bytesToCopy = elementByteSize * NumChannels * region.depth; + size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; + size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; - auto imgType = hImage->mem_.surface_mem_.get_image_type(); + auto ImgType = hImage->Mem.SurfaceMem.getImageType(); - size_t adjustedRegion[3] = {bytesToCopy, region.height, region.height}; - size_t dstOffset[3] = {byteOffsetX, origin.y, origin.z}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, 
region.height}; + size_t DstOffset[3] = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, + Result = commonEnqueueMemImageNDCopy(HIPStream, ImgType, AdjustedRegion, pSrc, hipMemoryTypeHost, nullptr, - array, hipMemoryTypeArray, dstOffset); + Array, hipMemoryTypeArray, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_WRITE, hQueue, hipStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_WRITE, + hQueue, HIPStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } return UR_RESULT_SUCCESS; - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( @@ -1046,67 +1040,67 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE) UR_ASSERT(hImageSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE) UR_ASSERT(hImageDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE) - UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == - hImageDst->mem_.surface_mem_.get_image_type(), + UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == + hImageDst->Mem.SurfaceMem.getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext 
active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { - retErr = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); } - hipArray *srcArray = hImageSrc->mem_.surface_mem_.get_array(); - hipArray_Format srcFormat; - size_t srcNumChannels; - getArrayDesc(srcArray, srcFormat, srcNumChannels); + hipArray *SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + hipArray_Format SrcFormat; + size_t SrcNumChannels; + getArrayDesc(SrcArray, SrcFormat, SrcNumChannels); - hipArray *dstArray = hImageDst->mem_.surface_mem_.get_array(); - hipArray_Format dstFormat; - size_t dstNumChannels; - getArrayDesc(dstArray, dstFormat, dstNumChannels); + hipArray *DstArray = hImageDst->Mem.SurfaceMem.getArray(); + hipArray_Format DstFormat; + size_t DstNumChannels; + getArrayDesc(DstArray, DstFormat, DstNumChannels); - UR_ASSERT(srcFormat == dstFormat, + UR_ASSERT(SrcFormat == DstFormat, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(srcNumChannels == dstNumChannels, + UR_ASSERT(SrcNumChannels == DstNumChannels, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - int elementByteSize = imageElementByteSize(srcFormat); + int ElementByteSize = imageElementByteSize(SrcFormat); - size_t dstByteOffsetX = dstOrigin.x * elementByteSize * srcNumChannels; - size_t srcByteOffsetX = srcOrigin.x * elementByteSize * dstNumChannels; - size_t bytesToCopy = elementByteSize * srcNumChannels * region.depth; + size_t DstByteOffsetX = dstOrigin.x * ElementByteSize * SrcNumChannels; + size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * DstNumChannels; + size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.depth; - auto imgType = hImageSrc->mem_.surface_mem_.get_image_type(); + auto ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); 
- size_t adjustedRegion[3] = {bytesToCopy, region.height, region.width}; - size_t srcOffset[3] = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; - size_t dstOffset[3] = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.width}; + size_t SrcOffset[3] = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + size_t DstOffset[3] = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray, - srcOffset, dstArray, hipMemoryTypeArray, dstOffset); + Result = commonEnqueueMemImageNDCopy( + HIPStream, ImgType, AdjustedRegion, SrcArray, hipMemoryTypeArray, + SrcOffset, DstArray, hipMemoryTypeArray, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_COPY, hQueue, hipStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, + hQueue, HIPStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } @@ -1127,54 +1121,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(ppRetMap, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t ret_err = UR_RESULT_ERROR_INVALID_OPERATION; - const bool is_pinned = - hBuffer->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; + const bool IsPinned = + hBuffer->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; // Currently no support for overlapping regions - if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; + if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + return Result; } // Allocate a pointer in the host to store the mapped information - auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); - *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = UR_RESULT_SUCCESS; + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(offset, mapFlags); + *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + if (HostPtr) { + Result = UR_RESULT_SUCCESS; } - if (!is_pinned && + if (!IsPinned && ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { // Pinned host memory is already on host so it doesn't need to be read. 
- ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, - hostPtr, numEventsInWaitList, - phEventWaitList, phEvent); + Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + HostPtr, numEventsInWaitList, + phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = - ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, - hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Error) { + Result = Error; } } } - return ret_err; + return Result; } /// Implements the unmap from the host, using a BufferWrite operation. 
@@ -1185,53 +1178,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t ret_err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - const bool is_pinned = - hMem->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + const bool IsPinned = + hMem->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - if (!is_pinned && - ((hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE) || - (hMem->mem_.buffer_mem_.get_map_flags() & - UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { + if (!IsPinned && ((hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE) || + (hMem->Mem.BufferMem.getMapFlags() & + UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { // Pinned host memory is only on host so it doesn't need to be written to. 
- ret_err = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), - hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + Result = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(pMappedPtr), + hMem->Mem.BufferMem.getSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Error) { + Result = Error; } } } - hMem->mem_.buffer_mem_.unmap(pMappedPtr); - return ret_err; + hMem->Mem.BufferMem.unmap(pMappedPtr); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -1243,38 +1235,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_quard guard; - hipStream_t hipStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + 
ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_FILL, hQueue, hipStream, stream_token)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_FILL, hQueue, HIPStream, StreamToken)); + EventPtr->start(); } switch (patternSize) { case 1: - result = UR_CHECK_ERROR( + Result = UR_CHECK_ERROR( hipMemsetD8Async(reinterpret_cast(ptr), - *(const uint8_t *)pPattern & 0xFF, size, hipStream)); + *(const uint8_t *)pPattern & 0xFF, size, HIPStream)); break; case 2: - result = UR_CHECK_ERROR(hipMemsetD16Async( + Result = UR_CHECK_ERROR(hipMemsetD16Async( reinterpret_cast(ptr), - *(const uint16_t *)pPattern & 0xFFFF, size, hipStream)); + *(const uint16_t *)pPattern & 0xFFFF, size, HIPStream)); break; case 4: - result = UR_CHECK_ERROR(hipMemsetD32Async( + Result = UR_CHECK_ERROR(hipMemsetD32Async( reinterpret_cast(ptr), - *(const uint32_t *)pPattern & 0xFFFFFFFF, size, hipStream)); + *(const uint32_t *)pPattern & 0xFFFFFFFF, size, HIPStream)); break; default: @@ -1282,14 +1274,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( } if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( @@ -1300,36 +1292,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr 
event_ptr{nullptr}; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_MEMCPY, hQueue, hipStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_MEMCPY, hQueue, HIPStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR( - hipMemcpyAsync(pDst, pSrc, size, hipMemcpyDefault, hipStream)); + Result = UR_CHECK_ERROR( + hipMemcpyAsync(pDst, pSrc, size, hipMemcpyDefault, HIPStream)); if (phEvent) { - result = event_ptr->record(); + Result = EventPtr->record(); } if (blocking) { - result = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } if (phEvent) { - *phEvent = event_ptr.release(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( @@ -1343,31 +1335,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( // flags is currently unused so fail if set if (flags != 0) return UR_RESULT_ERROR_INVALID_VALUE; - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = 
hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_PREFETCH, hQueue, hipStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_PREFETCH, hQueue, HIPStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR(hipMemPrefetchAsync( - pMem, size, hQueue->get_context()->get_device()->get(), hipStream)); + Result = UR_CHECK_ERROR(hipMemPrefetchAsync( + pMem, size, hQueue->getContext()->getDevice()->get(), HIPStream)); if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -1427,34 +1419,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - hipStream_t hipStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, hipStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + hipStream_t HIPStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - (*phEvent) = ur_event_handle_t_::make_native(UR_COMMAND_USM_MEMCPY_2D, - hQueue, hipStream); + (*phEvent) = ur_event_handle_t_::makeNative(UR_COMMAND_USM_MEMCPY_2D, + hQueue, HIPStream); (*phEvent)->start(); } - result = + Result = UR_CHECK_ERROR(hipMemcpy2DAsync(pDst, dstPitch, pSrc, srcPitch, width, - height, hipMemcpyDefault, hipStream)); + height, hipMemcpyDefault, 
HIPStream)); if (phEvent) { (*phEvent)->record(); } if (blocking) { - result = UR_CHECK_ERROR(hipStreamSynchronize(hipStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index 8267ef36f54df..c75a4cf52db7c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -11,154 +11,151 @@ #include "context.hpp" #include "platform.hpp" -ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, - ur_context_handle_t context, - ur_queue_handle_t queue, - hipStream_t stream, - uint32_t stream_token) - : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false}, - isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { - - bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; +ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, + ur_context_handle_t Context, + ur_queue_handle_t Queue, + hipStream_t Stream, uint32_t StreamToken) + : CommandType{Type}, RefCount{1}, HasBeenWaitedOn{false}, + IsRecorded{false}, IsStarted{false}, + StreamToken{StreamToken}, EventEnd{nullptr}, EvStart{nullptr}, + EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { + + bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; UR_CHECK_ERROR(hipEventCreateWithFlags( - &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming)); + &EventEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); - if (profilingEnabled) { - UR_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault)); - UR_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault)); + if (ProfilingEnabled) { + UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault)); + UR_CHECK_ERROR(hipEventCreateWithFlags(&EvStart, hipEventDefault)); } - if (queue_ != nullptr) { - urQueueRetain(queue_); + if (Queue != nullptr) { + urQueueRetain(Queue); } - urContextRetain(context_); + urContextRetain(Context); } ur_event_handle_t_::~ur_event_handle_t_() { - if (queue_ != nullptr) { - urQueueRelease(queue_); + if (Queue != nullptr) { + urQueueRelease(Queue); } - urContextRelease(context_); + urContextRelease(Context); } ur_result_t ur_event_handle_t_::start() { - assert(!is_started()); - ur_result_t result = UR_RESULT_SUCCESS; + assert(!isStarted()); + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { // NOTE: This relies on the default stream to be unused. 
- UR_CHECK_ERROR(hipEventRecord(evQueued_, 0)); - UR_CHECK_ERROR(hipEventRecord(evStart_, queue_->get())); + UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); + UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Error) { + Result = Error; } - isStarted_ = true; - return result; + IsStarted = true; + return Result; } -bool ur_event_handle_t_::is_completed() const noexcept { - if (!isRecorded_) { +bool ur_event_handle_t_::isCompleted() const noexcept { + if (!IsRecorded) { return false; } - if (!hasBeenWaitedOn_) { - const hipError_t ret = hipEventQuery(evEnd_); - if (ret != hipSuccess && ret != hipErrorNotReady) { - UR_CHECK_ERROR(ret); + if (!HasBeenWaitedOn) { + const hipError_t Result = hipEventQuery(EventEnd); + if (Result != hipSuccess && Result != hipErrorNotReady) { + UR_CHECK_ERROR(Result); return false; } - if (ret == hipErrorNotReady) { + if (Result == hipErrorNotReady) { return false; } } return true; } -uint64_t ur_event_handle_t_::get_queued_time() const { - float miliSeconds = 0.0f; - assert(is_started()); +uint64_t ur_event_handle_t_::getQueuedTime() const { + float MiliSeconds = 0.0f; + assert(isStarted()); - UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_)); - return static_cast<uint64_t>(miliSeconds * 1.0e6); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, EvStart, EventEnd)); + return static_cast<uint64_t>(MiliSeconds * 1.0e6); } -uint64_t ur_event_handle_t_::get_start_time() const { - float miliSeconds = 0.0f; - assert(is_started()); +uint64_t ur_event_handle_t_::getStartTime() const { + float MiliSeconds = 0.0f; + assert(isStarted()); - UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, - ur_platform_handle_t_::evBase_, evStart_)); - return static_cast<uint64_t>(miliSeconds * 1.0e6); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, + ur_platform_handle_t_::EvBase, EvStart)); + return static_cast<uint64_t>(MiliSeconds * 1.0e6); } -uint64_t ur_event_handle_t_::get_end_time() const { - float
miliSeconds = 0.0f; - assert(is_started() && is_recorded()); +uint64_t ur_event_handle_t_::getEndTime() const { + float MiliSeconds = 0.0f; + assert(isStarted() && isRecorded()); - UR_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, - ur_platform_handle_t_::evBase_, evEnd_)); - return static_cast(miliSeconds * 1.0e6); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, + ur_platform_handle_t_::EvBase, EventEnd)); + return static_cast(MiliSeconds * 1.0e6); } ur_result_t ur_event_handle_t_::record() { - if (is_recorded() || !is_started()) { + if (isRecorded() || !isStarted()) { return UR_RESULT_ERROR_INVALID_EVENT; } - ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; - UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { + EventId = Queue->getNextEventId(); + if (EventId == 0) { sycl::detail::ur::die( "Unrecoverable program state reached in event identifier overflow"); } - result = UR_CHECK_ERROR(hipEventRecord(evEnd_, stream_)); - } catch (ur_result_t error) { - result = error; + Result = UR_CHECK_ERROR(hipEventRecord(EventEnd, Stream)); + } catch (ur_result_t Error) { + Result = Error; } - if (result == UR_RESULT_SUCCESS) { - isRecorded_ = true; + if (Result == UR_RESULT_SUCCESS) { + IsRecorded = true; } - return result; + return Result; } ur_result_t ur_event_handle_t_::wait() { - ur_result_t retErr; + ur_result_t Result; try { - retErr = UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; - } catch (ur_result_t error) { - retErr = error; + Result = UR_CHECK_ERROR(hipEventSynchronize(EventEnd)); + HasBeenWaitedOn = true; + } catch (ur_result_t Error) { + Result = Error; } - return retErr; + return Result; } ur_result_t ur_event_handle_t_::release() { - assert(queue_ != nullptr); - UR_CHECK_ERROR(hipEventDestroy(evEnd_)); + assert(Queue != nullptr); + 
UR_CHECK_ERROR(hipEventDestroy(EventEnd)); - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { - UR_CHECK_ERROR(hipEventDestroy(evQueued_)); - UR_CHECK_ERROR(hipEventDestroy(evStart_)); + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(hipEventDestroy(EvQueued)); + UR_CHECK_ERROR(hipEventDestroy(EvStart)); } return UR_RESULT_SUCCESS; } -//////////////////// - UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); @@ -166,28 +163,24 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { try { - auto context = phEventWaitList[0]->get_context(); - ScopedContext active(context); + auto Context = phEventWaitList[0]->getContext(); + ScopedContext Active(Context); - auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { - UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); - UR_ASSERT(event->get_context() == context, + auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { + UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(Event->getContext() == Context, UR_RESULT_ERROR_INVALID_CONTEXT); - return event->wait(); + return Event->wait(); }; - return forLatestEvents(phEventWaitList, numEvents, waitFunc); - } catch (ur_result_t err) { - return err; + return forLatestEvents(phEventWaitList, numEvents, WaitFunc); + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } } -// -// Events -// - UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, size_t propValueSize, @@ -199,15 +192,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); switch (propName) { case UR_EVENT_INFO_COMMAND_QUEUE: - return ReturnValue(hEvent->get_queue()); + return ReturnValue(hEvent->getQueue()); case UR_EVENT_INFO_COMMAND_TYPE: - return ReturnValue(hEvent->get_command_type()); + return ReturnValue(hEvent->getCommandType()); case UR_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(hEvent->get_reference_count()); + return ReturnValue(hEvent->getReferenceCount()); case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: - return ReturnValue(hEvent->get_execution_status()); + return ReturnValue(hEvent->getExecutionStatus()); case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->get_context()); + return ReturnValue(hEvent->getContext()); default: break; } @@ -224,9 +217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); - ur_queue_handle_t queue = hEvent->get_queue(); - if (queue == nullptr || - !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + ur_queue_handle_t Queue = hEvent->getQueue(); + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -235,11 +227,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: // Note: No user for this case - return ReturnValue(static_cast(hEvent->get_queued_time())); + return ReturnValue(static_cast(hEvent->getQueuedTime())); case UR_PROFILING_INFO_COMMAND_START: - return 
ReturnValue(static_cast(hEvent->get_start_time())); + return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: - return ReturnValue(static_cast(hEvent->get_end_time())); + return ReturnValue(static_cast(hEvent->getEndTime())); default: break; } @@ -260,10 +252,10 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - const auto refCount = hEvent->increment_reference_count(); + const auto RefCount = hEvent->incrementReferenceCount(); sycl::detail::ur::assertion( - refCount != 0, "Reference count overflow detected in urEventRetain."); + RefCount != 0, "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; } @@ -274,20 +266,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hEvent->get_reference_count() != 0, + hEvent->getReferenceCount() != 0, "Reference count overflow detected in urEventRelease."); // decrement ref count. If it is 0, delete the event. - if (hEvent->decrement_reference_count() == 0) { + if (hEvent->decrementReferenceCount() == 0) { std::unique_ptr event_ptr{hEvent}; - ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - ScopedContext active(hEvent->get_context()); - result = hEvent->release(); + ScopedContext Active(hEvent->getContext()); + Result = hEvent->release(); } catch (...) 
{ - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -295,8 +287,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { /// Gets the native HIP handle of a UR event object /// -/// \param[in] event The UR event to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the UR event object. +/// \param[in] hEvent The UR event to get the native HIP object of. +/// \param[out] phNativeEvent Set to the native handle of the UR event object. /// /// \return UR_RESULT_SUCCESS on success. UR_RESULT_ERROR_INVALID_EVENT if given /// a user event. @@ -313,10 +305,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( /// TODO: Implement this. /// NOTE: The created UR object takes ownership of the native handle. /// -/// \param[in] nativeHandle The native handle to create UR event object from. -/// \param[out] event Set to the UR event object created from native handle. +/// \param[in] hNativeEvent The native handle to create UR event object from. +/// \param[out] phEvent Set to the UR event object created from native handle. 
/// -/// \return TBD +/// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, const ur_event_native_properties_t *pProperties, diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp index 3c9700419cd8b..d77b080909de9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp @@ -22,62 +22,62 @@ struct ur_event_handle_t_ { ur_result_t start(); - native_type get() const noexcept { return evEnd_; }; + native_type get() const noexcept { return EventEnd; }; - ur_queue_handle_t get_queue() const noexcept { return queue_; } + ur_queue_handle_t getQueue() const noexcept { return Queue; } - hipStream_t get_stream() const noexcept { return stream_; } + hipStream_t getStream() const noexcept { return Stream; } - uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + uint32_t getComputeStreamToken() const noexcept { return StreamToken; } - ur_command_t get_command_type() const noexcept { return commandType_; } + ur_command_t getCommandType() const noexcept { return CommandType; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - bool is_recorded() const noexcept { return isRecorded_; } + bool isRecorded() const noexcept { return IsRecorded; } - bool is_started() const noexcept { return isStarted_; } + bool isStarted() const noexcept { return IsStarted; } - bool is_completed() const noexcept; + bool isCompleted() const noexcept; - uint32_t get_execution_status() const noexcept { + uint32_t getExecutionStatus() const noexcept { - if (!is_recorded()) { + if (!isRecorded()) { return UR_EVENT_STATUS_SUBMITTED; } - if (!is_completed()) { + if (!isCompleted()) { return UR_EVENT_STATUS_RUNNING; } return 
UR_EVENT_STATUS_COMPLETE; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - uint32_t increment_reference_count() { return ++refCount_; } + uint32_t incrementReferenceCount() { return ++RefCount; } - uint32_t decrement_reference_count() { return --refCount_; } + uint32_t decrementReferenceCount() { return --RefCount; } - uint32_t get_event_id() const noexcept { return eventId_; } + uint32_t getEventId() const noexcept { return EventId; } // Returns the counter time when the associated command(s) were enqueued // - uint64_t get_queued_time() const; + uint64_t getQueuedTime() const; // Returns the counter time when the associated command(s) started execution // - uint64_t get_start_time() const; + uint64_t getStartTime() const; // Returns the counter time when the associated command(s) completed // - uint64_t get_end_time() const; + uint64_t getEndTime() const; // construct a native HIP. This maps closely to the underlying HIP event. static ur_event_handle_t - make_native(ur_command_t type, ur_queue_handle_t queue, hipStream_t stream, - uint32_t stream_token = std::numeric_limits<uint32_t>::max()) { - return new ur_event_handle_t_(type, queue->get_context(), queue, stream, - stream_token); + makeNative(ur_command_t Type, ur_queue_handle_t Queue, hipStream_t Stream, + uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) { + return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream, + StreamToken); } ur_result_t release(); @@ -85,91 +85,89 @@ struct ur_event_handle_t_ { ~ur_event_handle_t_(); private: - // This constructor is private to force programmers to use the make_native / + // This constructor is private to force programmers to use the makeNative / // make_user static members in order to create a ur_event_handle_t for HIP. 
- ur_event_handle_t_(ur_command_t type, ur_context_handle_t context, - ur_queue_handle_t queue, hipStream_t stream, - uint32_t stream_token); + ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, + ur_queue_handle_t Queue, hipStream_t Stream, + uint32_t StreamToken); - ur_command_t commandType_; // The type of command associated with event. + ur_command_t CommandType; // The type of command associated with event. - std::atomic_uint32_t refCount_; // Event reference count. + std::atomic_uint32_t RefCount; // Event reference count. - bool hasBeenWaitedOn_; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. + bool HasBeenWaitedOn; // Signifies whether the event has been waited + // on through a call to wait(), which implies + // that it has completed. - bool isRecorded_; // Signifies wether a native HIP event has been recorded - // yet. - bool isStarted_; // Signifies wether the operation associated with the - // UR event has started or not - // + bool IsRecorded; // Signifies whether a native HIP event has been recorded + // yet. + bool IsStarted; // Signifies whether the operation associated with the + // UR event has started or not + // - uint32_t streamToken_; - uint32_t eventId_; // Queue identifier of the event. + uint32_t StreamToken; + uint32_t EventId; // Queue identifier of the event. - native_type evEnd_; // HIP event handle. If this ur_event_handle_t_ represents - // a user event, this will be nullptr. + native_type EventEnd; // HIP event handle. If this ur_event_handle_t_ + // represents a user event, this will be nullptr. 
- native_type evStart_; // HIP event handle associated with the start + native_type EvStart; // HIP event handle associated with the start - native_type evQueued_; // HIP event handle associated with the time - // the command was enqueued + native_type EvQueued; // HIP event handle associated with the time + // the command was enqueued - ur_queue_handle_t queue_; // ur_queue_handle_t associated with the event. If - // this is a user event, this will be nullptr. + ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If + // this is a user event, this will be nullptr. - hipStream_t stream_; // hipStream_t associated with the event. If this is a - // user event, this will be uninitialized. + hipStream_t Stream; // hipStream_t associated with the event. If this is a + // user event, this will be uninitialized. - ur_context_handle_t - context_; // ur_context_handle_t associated with the event. If this - // is a native event, this will be the same - // context associated with the queue_ member. + ur_context_handle_t Context; // ur_context_handle_t associated with the event. + // If this is a native event, this will be the + // same context associated with the Queue member. }; -// Iterates over the event wait list, returns correct ur_result_t error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single ur_event_handle_t argument and return a -// ur_result_t. +// Iterate over `EventWaitList` and apply the given callback `F` to the +// latest event on each queue therein. The callback must take a single +// ur_event_handle_t argument and return a ur_result_t. If the callback returns +// an error, the iteration terminates and the error is returned. 
template -ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, - size_t num_events_in_wait_list, Func &&f) { +ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, + size_t NumEventsInWaitList, Func &&F) { - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + if (EventWaitList == nullptr || NumEventsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); + if (NumEventsInWaitList == 1) { + return F(EventWaitList[0]); } - std::vector events{ - event_wait_list, event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), - [](ur_event_handle_t e0, ur_event_handle_t e1) { + std::vector Events{EventWaitList, + EventWaitList + NumEventsInWaitList}; + std::sort(Events.begin(), Events.end(), + [](ur_event_handle_t E0, ur_event_handle_t E1) { // Tiered sort creating sublists of streams (smallest value first) // in which the corresponding events are sorted into a sequence of // newest first. 
- return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); + return E0->getStream() < E1->getStream() || + (E0->getStream() == E1->getStream() && + E0->getEventId() > E1->getEventId()); }); - bool first = true; - hipStream_t lastSeenStream = 0; - for (ur_event_handle_t event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { + hipStream_t LastSeenStream = 0; + for (size_t i = 0; i < Events.size(); i++) { + auto Event = Events[i]; + if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { continue; } - first = false; - lastSeenStream = event->get_stream(); + LastSeenStream = Event->getStream(); - auto result = f(event); - if (result != UR_RESULT_SUCCESS) { - return result; + auto Result = F(Event); + if (Result != UR_RESULT_SUCCESS) { + return Result; } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index c7909fae2f5d6..0852767c95d05 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -17,38 +17,38 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t retErr = UR_RESULT_SUCCESS; - std::unique_ptr retKernel{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetKernel{nullptr}; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - hipFunction_t hipFunc; - retErr = UR_CHECK_ERROR( - hipModuleGetFunction(&hipFunc, hProgram->get(), pKernelName)); + hipFunction_t HIPFunc; + Result = UR_CHECK_ERROR( + hipModuleGetFunction(&HIPFunc, hProgram->get(), pKernelName)); - std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; - hipFunction_t hipFuncWithOffsetParam; 
- hipError_t offsetRes = hipModuleGetFunction( - &hipFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + std::string KernelNameWoffset = std::string(pKernelName) + "_with_offset"; + hipFunction_t HIPFuncWithOffsetParam; + hipError_t OffsetRes = hipModuleGetFunction( + &HIPFuncWithOffsetParam, hProgram->get(), KernelNameWoffset.c_str()); // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == hipErrorNotFound) { - hipFuncWithOffsetParam = nullptr; + if (OffsetRes == hipErrorNotFound) { + HIPFuncWithOffsetParam = nullptr; } else { - retErr = UR_CHECK_ERROR(offsetRes); + Result = UR_CHECK_ERROR(OffsetRes); } - retKernel = std::unique_ptr( - new ur_kernel_handle_t_{hipFunc, hipFuncWithOffsetParam, pKernelName, - hProgram, hProgram->get_context()}); - } catch (ur_result_t err) { - retErr = err; + RetKernel = std::unique_ptr( + new ur_kernel_handle_t_{HIPFunc, HIPFuncWithOffsetParam, pKernelName, + hProgram, hProgram->getContext()}); + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phKernel = retKernel.release(); - return retErr; + *phKernel = RetKernel.release(); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -57,22 +57,21 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Here we want to query about a kernel's cuda blocks! 
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; + size_t GlobalWorkSize[3] = {0, 0, 0}; - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, + hipDeviceGetAttribute(&MaxBlockDimX, hipDeviceAttributeMaxBlockDimX, hDevice->get()) == hipSuccess); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, + hipDeviceGetAttribute(&MaxBlockDimY, hipDeviceAttributeMaxBlockDimY, hDevice->get()) == hipSuccess); sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, + hipDeviceGetAttribute(&MaxBlockDimZ, hipDeviceAttributeMaxBlockDimZ, hDevice->get()) == hipSuccess); int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; @@ -86,18 +85,18 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, hDevice->get()) == hipSuccess); - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return ReturnValue(global_work_size, 3); + GlobalWorkSize[0] = MaxBlockDimX * max_grid_dimX; + GlobalWorkSize[1] = MaxBlockDimY * max_grid_dimY; + GlobalWorkSize[2] = MaxBlockDimZ * max_grid_dimZ; + return ReturnValue(GlobalWorkSize, 3); } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - hipFuncGetAttribute(&max_threads, + hipFuncGetAttribute(&MaxThreads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == hipSuccess); - return ReturnValue(size_t(max_threads)); + return ReturnValue(size_t(MaxThreads)); } case 
UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { size_t group_size[3] = {0, 0, 0}; @@ -111,27 +110,27 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, } case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // OpenCL LOCAL == HIP SHARED - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hipFuncGetAttribute(&Bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == hipSuccess); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, hDevice->get()) == hipSuccess); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == HIP LOCAL - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hipFuncGetAttribute(&Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()) == hipSuccess); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } default: break; @@ -142,10 +141,9 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->get_reference_count() > 0u, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); - hKernel->increment_reference_count(); + hKernel->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -155,11 +153,10 @@ urKernelRelease(ur_kernel_handle_t hKernel) { // double delete or 
someone is messing with the ref count. // either way, cannot safely proceed. - UR_ASSERT(hKernel->get_reference_count() != 0, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); // decrement ref count. If it is 0, delete the program. - if (hKernel->decrement_reference_count() == 0) { + if (hKernel->decrementReferenceCount() == 0) { // no internal cuda resources to clean up. Just delete it. delete hKernel; return UR_RESULT_SUCCESS; @@ -172,8 +169,8 @@ urKernelRelease(ur_kernel_handle_t hKernel) { // feature. UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { - (void)hKernel; - (void)phNativeKernel; + std::ignore = hKernel; + std::ignore = phNativeKernel; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -183,17 +180,17 @@ urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { - hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + hKernel->setKernelArg(argIndex, argSize, pArgValue); } else { - hKernel->set_kernel_local_arg(argIndex, argSize); + hKernel->setKernelLocalArg(argIndex, argSize); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -207,15 +204,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, switch (propName) { case UR_KERNEL_INFO_FUNCTION_NAME: - return ReturnValue(hKernel->get_name()); + return ReturnValue(hKernel->getName()); case UR_KERNEL_INFO_NUM_ARGS: - return ReturnValue(hKernel->get_num_args()); + return ReturnValue(hKernel->getNumArgs()); case UR_KERNEL_INFO_REFERENCE_COUNT: - return 
ReturnValue(hKernel->get_reference_count()); + return ReturnValue(hKernel->getReferenceCount()); case UR_KERNEL_INFO_CONTEXT: - return ReturnValue(hKernel->get_context()); + return ReturnValue(hKernel->getContext()); case UR_KERNEL_INFO_PROGRAM: - return ReturnValue(hKernel->get_program()); + return ReturnValue(hKernel->getProgram()); case UR_KERNEL_INFO_ATTRIBUTES: return ReturnValue(""); default: @@ -235,25 +232,25 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, switch (propName) { case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, hDevice->get()) == hipSuccess); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - hipFuncGetAttribute(&max_threads, + hipFuncGetAttribute(&MaxThreads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == hipSuccess); - int warpSize = 0; + int WarpSize = 0; urKernelGetSubGroupInfo(hKernel, hDevice, UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, - sizeof(uint32_t), &warpSize, nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(static_cast(maxWarps)); + sizeof(uint32_t), &WarpSize, nullptr); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(static_cast(MaxWarps)); } case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { // Return value of 0 => not specified @@ -276,7 +273,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ur_kernel_handle_t hKernel, uint32_t 
argIndex, const void *pArgValue) { - hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } @@ -288,14 +285,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. if (hArgValue == nullptr) { - hKernel->set_kernel_arg(argIndex, 0, nullptr); + hKernel->setKernelArg(argIndex, 0, nullptr); return UR_RESULT_SUCCESS; } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { - auto array = hArgValue->mem_.surface_mem_.get_array(); + if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { + auto array = hArgValue->Mem.SurfaceMem.getArray(); hipArray_Format Format; size_t NumChannels; getArrayDesc(array, Format, NumChannels); @@ -306,18 +303,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( "UR HIP kernels only support images with channel types int32, " "uint32, float, and half."); } - hipSurfaceObject_t hipSurf = hArgValue->mem_.surface_mem_.get_surface(); - hKernel->set_kernel_arg(argIndex, sizeof(hipSurf), (void *)&hipSurf); + hipSurfaceObject_t hipSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hKernel->setKernelArg(argIndex, sizeof(hipSurf), (void *)&hipSurf); } else { - void *hipPtr = hArgValue->mem_.buffer_mem_.get_void(); - hKernel->set_kernel_arg(argIndex, sizeof(void *), (void *)&hipPtr); + void *HIPPtr = hArgValue->Mem.BufferMem.getVoid(); + hKernel->setKernelArg(argIndex, sizeof(void *), (void *)&HIPPtr); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -327,20 +324,24 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel != nullptr, 
UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hArgValue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - uint32_t samplerProps = hArgValue->props_; - hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); - } catch (ur_result_t err) { - retErr = err; + uint32_t SamplerProps = hArgValue->Props; + hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps); + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } // A NOP for the HIP backend UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, const void *pPropValue) { + std::ignore = hKernel; + std::ignore = propName; + std::ignore = propSize; + std::ignore = pPropValue; return UR_RESULT_SUCCESS; } @@ -349,5 +350,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_program_handle_t hProgram, const ur_kernel_native_properties_t *pProperties, ur_kernel_handle_t *phKernel) { + std::ignore = hNativeKernel; + std::ignore = hContext; + std::ignore = hProgram; + std::ignore = pProperties; + std::ignore = phKernel; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp index 53fd8368e34a7..b648f0b9afeee 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp @@ -22,7 +22,7 @@ /// invocation. This is not the case of HIPFunction objects, /// which are simply passed together with the arguments on the invocation. /// The UR Kernel implementation for HIP stores the list of arguments, -/// argument sizes and offsets to emulate the interface of UR Kernel, +/// argument sizes, and offsets to emulate the interface of UR Kernel, /// saving the arguments for the later dispatch. 
/// Note that in UR API, the Local memory is specified as a size per /// individual argument, but in HIP only the total usage of shared @@ -30,16 +30,15 @@ /// A compiler pass converts the UR API local memory model into the /// HIP shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. -/// struct ur_kernel_handle_t_ { using native_type = hipFunction_t; - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - ur_context_handle_t context_; - ur_program_handle_t program_; - std::atomic_uint32_t refCount_; + native_type Function; + native_type FunctionWithOffsetParam; + std::string Name; + ur_context_handle_t Context; + ur_program_handle_t Program; + std::atomic_uint32_t RefCount; /// Structure that holds the arguments to the kernel. /// Note earch argument size is known, since it comes @@ -53,142 +52,143 @@ struct ur_kernel_handle_t_ { using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; + args_t Storage; + args_size_t ParamSizes; + args_index_t Indices; + args_size_t OffsetPerIndex; - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); + Indices.emplace_back(&ImplicitOffsetArgs); } - /// Adds an argument to the kernel. + /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. 
- void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { - // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); + void addArg(size_t Index, size_t Size, const void *Arg, + size_t LocalSize = 0) { + if (Index + 2 > Indices.size()) { + // Move implicit offset argument Index with the end + Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); + ParamSizes.resize(Index + 1); + OffsetPerIndex.resize(Index + 1); } - paramSizes_[index] = size; + ParamSizes[Index] = Size; // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + OffsetPerIndex[Index] = LocalSize; } - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; + const size_t MaxAlignment = sizeof(double) * 16; // for arguments smaller than the maximum alignment simply align to the // size of the argument - const size_t alignment = std::min(max_alignment, size); + const size_t Alignment = std::min(MaxAlignment, Size); // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 0) { - alignedLocalOffset += alignment - (localOffset % alignment); + size_t AlignedLocalOffset = 
LocalOffset; + size_t Pad = LocalOffset % Alignment; + if (Pad != 0) { + AlignedLocalOffset += Alignment - Pad; } - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + Size + (AlignedLocalOffset - LocalOffset)); } - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { + assert(Size == sizeof(std::uint32_t) * 3); + std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + void clearLocalSize() { + std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); } - const args_index_t &get_indices() const noexcept { return indices_; } + const args_index_t &getIndices() const noexcept { return Indices; } - uint32_t get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); + uint32_t getLocalSize() const { + return std::accumulate(std::begin(OffsetPerIndex), + std::end(OffsetPerIndex), 0); } - } args_; - - ur_kernel_handle_t_(hipFunction_t func, hipFunction_t funcWithOffsetParam, - const char *name, ur_program_handle_t program, - ur_context_handle_t ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - urProgramRetain(program_); - urContextRetain(context_); + } Args; + + ur_kernel_handle_t_(hipFunction_t Func, hipFunction_t FuncWithOffsetParam, + const char *Name, ur_program_handle_t Program, + ur_context_handle_t Ctxt) + : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, + Name{Name}, Context{Ctxt}, Program{Program}, RefCount{1} { + urProgramRetain(Program); + urContextRetain(Context); } - 
ur_kernel_handle_t_(hipFunction_t func, const char *name, - ur_program_handle_t program, ur_context_handle_t ctxt) - : ur_kernel_handle_t_{func, nullptr, name, program, ctxt} {} + ur_kernel_handle_t_(hipFunction_t Func, const char *Name, + ur_program_handle_t Program, ur_context_handle_t Ctxt) + : ur_kernel_handle_t_{Func, nullptr, Name, Program, Ctxt} {} ~ur_kernel_handle_t_() { - urProgramRelease(program_); - urContextRelease(context_); + urProgramRelease(Program); + urContextRelease(Context); } - ur_program_handle_t get_program() const noexcept { return program_; } + ur_program_handle_t getProgram() const noexcept { return Program; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - native_type get() const noexcept { return function_; }; + native_type get() const noexcept { return Function; }; - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; + native_type getWithOffsetParameter() const noexcept { + return FunctionWithOffsetParam; }; - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; + bool hasWithOffsetParameter() const noexcept { + return FunctionWithOffsetParam != nullptr; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - const char *get_name() const noexcept { return name_.c_str(); } + const char *getName() const noexcept { return Name.c_str(); } - /// Returns the number of arguments, excluding the implicit global offset. + /// Get the number of kernel arguments, excluding the implicit global offset. 
/// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the HIP Driver API - uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); + void setKernelArg(int Index, size_t Size, const void *Arg) { + Args.addArg(Index, Size, Arg); } - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); + void setKernelLocalArg(int Index, size_t Size) { + Args.addLocalArg(Index, Size); } - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - return args_.set_implicit_offset(size, implicitOffset); + void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { + return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); + const arguments::args_index_t &getArgIndices() const { + return Args.getIndices(); } - uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - void clear_local_size() { args_.clear_local_size(); } + void clearLocalSize() { Args.clearLocalSize(); } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp index 8be8035ec0acf..9f13d3b6c7c9f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -5,59 +5,58 @@ /// Decreases the reference count of the Mem object. 
/// If this is zero, calls the relevant HIP Free function /// \return UR_RESULT_SUCCESS unless deallocation error -/// UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t ret = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { // Do nothing if there are other references - if (hMem->decrement_reference_count() > 0) { + if (hMem->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } // make sure memObj is released in case UR_CHECK_ERROR throws std::unique_ptr uniqueMemObj(hMem); - if (hMem->is_sub_buffer()) { + if (hMem->isSubBuffer()) { return UR_RESULT_SUCCESS; } - ScopedContext active(uniqueMemObj->get_context()); + ScopedContext Active(uniqueMemObj->getContext()); - if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: - ret = UR_CHECK_ERROR( - hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_)); + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + switch (uniqueMemObj->Mem.BufferMem.MemAllocMode) { + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: + Result = + UR_CHECK_ERROR(hipFree((void *)uniqueMemObj->Mem.BufferMem.Ptr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = UR_CHECK_ERROR( - hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: + Result = UR_CHECK_ERROR( + hipHostUnregister(uniqueMemObj->Mem.BufferMem.HostPtr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = UR_CHECK_ERROR( - hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: + Result = + 
UR_CHECK_ERROR(hipFreeHost(uniqueMemObj->Mem.BufferMem.HostPtr)); }; } - else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { - ret = UR_CHECK_ERROR(hipDestroySurfaceObject( - uniqueMemObj->mem_.surface_mem_.get_surface())); - auto array = uniqueMemObj->mem_.surface_mem_.get_array(); - ret = UR_CHECK_ERROR(hipFreeArray(array)); + else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { + Result = UR_CHECK_ERROR( + hipDestroySurfaceObject(uniqueMemObj->Mem.SurfaceMem.getSurface())); + auto Array = uniqueMemObj->Mem.SurfaceMem.getArray(); + Result = UR_CHECK_ERROR(hipFreeArray(Array)); } - } catch (ur_result_t err) { - ret = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - if (ret != UR_RESULT_SUCCESS) { + if (Result != UR_RESULT_SUCCESS) { // A reported HIP error is either an implementation or an asynchronous HIP // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely @@ -71,8 +70,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { /// Creates a UR Memory object using a HIP memory allocation. /// Can trigger a manual copy depending on the mode. -/// \TODO Implement USE_HOST_PTR using hipHostRegister -/// +/// \TODO Implement USE_HOST_PTR using hipHostRegister - See #9789 UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { @@ -92,76 +90,75 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Currently, USE_HOST_PTR is not implemented using host register // since this triggers a weird segfault after program ends. // Setting this constant to true enables testing that behavior. 
- const bool enableUseHostPtr = false; - const bool performInitialCopy = + const bool EnableUseHostPtr = false; + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || - ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); - ur_result_t retErr = UR_RESULT_SUCCESS; - ur_mem_handle_t retMemObj = nullptr; + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); + ur_result_t Result = UR_RESULT_SUCCESS; + ur_mem_handle_t RetMemObj = nullptr; try { - ScopedContext active(hContext); - void *ptr; + ScopedContext Active(hContext); + void *Ptr; auto pHost = pProperties ? pProperties->pHost : nullptr; - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { - retErr = + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + Result = UR_CHECK_ERROR(hipHostRegister(pHost, size, hipHostRegisterMapped)); - retErr = UR_CHECK_ERROR(hipHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + Result = UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - retErr = UR_CHECK_ERROR(hipHostMalloc(&pHost, size)); - retErr = UR_CHECK_ERROR(hipHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + Result = UR_CHECK_ERROR(hipHostMalloc(&pHost, size)); + Result = UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; } else { - retErr = UR_CHECK_ERROR(hipMalloc(&ptr, size)); + Result = UR_CHECK_ERROR(hipMalloc(&Ptr, size)); if (flags & 
UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; } } - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { ur_mem_handle_t parentBuffer = nullptr; - auto devPtr = reinterpret_cast< - ur_mem_handle_t_::mem_::mem_::buffer_mem_::native_type>(ptr); - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, allocMode, devPtr, pHost, size}); - if (urMemObj != nullptr) { - retMemObj = urMemObj.release(); - if (performInitialCopy) { + auto DevPtr = + reinterpret_cast( + Ptr); + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, AllocMode, DevPtr, pHost, size}); + if (URMemObj != nullptr) { + RetMemObj = URMemObj.release(); + if (PerformInitialCopy) { // Operates on the default stream of the current HIP context. - retErr = UR_CHECK_ERROR(hipMemcpyHtoD(devPtr, pHost, size)); + Result = UR_CHECK_ERROR(hipMemcpyHtoD(DevPtr, pHost, size)); // Synchronize with default stream implicitly used by hipMemcpyHtoD // to make buffer data available on device before any other UR call // uses it. - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { hipStream_t defaultStream = 0; - retErr = UR_CHECK_ERROR(hipStreamSynchronize(defaultStream)); + Result = UR_CHECK_ERROR(hipStreamSynchronize(defaultStream)); } } } else { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - *phBuffer = retMemObj; + *phBuffer = RetMemObj; - return retErr; + return Result; } /// Implements a buffer partition in the HIP backend. 
/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented /// as an offset over an existing HIP allocation. -/// UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( ur_mem_handle_t hBuffer, ur_mem_flags_t flags, ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, @@ -169,8 +166,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); // Default value for flags means UR_MEM_FLAG_READ_WRITE. if (flags == 0) { @@ -181,11 +178,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), UR_RESULT_ERROR_INVALID_VALUE); - if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } - if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } @@ -197,44 +194,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - UR_ASSERT(((pRegion->origin + pRegion->size) <= - hBuffer->mem_.buffer_mem_.get_size()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + UR_ASSERT( + ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Retained indirectly due to retaining parent buffer 
below. - ur_context_handle_t context = hBuffer->context_; - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + ur_context_handle_t Context = hBuffer->Context; + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - UR_ASSERT(hBuffer->mem_.buffer_mem_.ptr_ != - ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}, + UR_ASSERT(hBuffer->Mem.BufferMem.Ptr != + ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = - hBuffer->mem_.buffer_mem_.get_with_offset(pRegion->origin); + ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = + hBuffer->Mem.BufferMem.getWithOffset(pRegion->origin); - void *hostPtr = nullptr; - if (hBuffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + - pRegion->origin; + void *HostPtr = nullptr; + if (hBuffer->Mem.BufferMem.HostPtr) { + HostPtr = + static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; } - ReleaseGuard releaseGuard(hBuffer); + ReleaseGuard ReleaseGuard(hBuffer); - std::unique_ptr retMemObj{nullptr}; + std::unique_ptr RetMemObj{nullptr}; try { - ScopedContext active(context); + ScopedContext Active(Context); - retMemObj = std::unique_ptr{new ur_mem_handle_t_{ - context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; - } catch (ur_result_t err) { + RetMemObj = std::unique_ptr{new ur_mem_handle_t_{ + Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + } catch (ur_result_t Err) { *phMem = nullptr; - return err; + return Err; } catch (...) 
{ *phMem = nullptr; return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - releaseGuard.dismiss(); - *phMem = retMemObj.release(); + ReleaseGuard.dismiss(); + *phMem = RetMemObj.release(); return UR_RESULT_SUCCESS; } @@ -247,27 +244,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - ScopedContext active(hMemory->get_context()); + ScopedContext Active(hMemory->getContext()); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { - size_t allocSize = 0; - UR_CHECK_ERROR(hipMemGetAddressRange(nullptr, &allocSize, - hMemory->mem_.buffer_mem_.ptr_)); - return ReturnValue(allocSize); - } catch (ur_result_t err) { - return err; + size_t AllocSize = 0; + UR_CHECK_ERROR(hipMemGetAddressRange(nullptr, &AllocSize, + hMemory->Mem.BufferMem.Ptr)); + return ReturnValue(AllocSize); + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } case UR_MEM_INFO_CONTEXT: { - return ReturnValue(hMemory->get_context()); + return ReturnValue(hMemory->getContext()); } default: @@ -277,31 +274,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, /// Gets the native HIP handle of a UR mem object /// -/// \param[in] mem The UR mem to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the UR mem object. +/// \param[in] hMem The UR mem to get the native HIP object of. +/// \param[out] phNativeMem Set to the native handle of the UR mem object. 
/// /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { #if defined(__HIP_PLATFORM_NVIDIA__) - if (sizeof(ur_mem_handle_t_::mem_::buffer_mem_::native_type) > + if (sizeof(ur_mem_handle_t_::MemImpl::BufferMem::native_type) > sizeof(ur_native_handle_t)) { // Check that all the upper bits that cannot be represented by // ur_native_handle_t are empty. // NOTE: The following shift might trigger a warning, but the check in the // if above makes sure that this does not underflow. - ur_mem_handle_t_::mem_::buffer_mem_::native_type upperBits = - hMem->mem_.buffer_mem_.get() >> (sizeof(ur_native_handle_t) * CHAR_BIT); - if (upperBits) { + ur_mem_handle_t_::MemImpl::BufferMem::native_type UpperBits = + hMem->Mem.BufferMem.get() >> (sizeof(ur_native_handle_t) * CHAR_BIT); + if (UpperBits) { // Return an error if any of the remaining bits is non-zero. return UR_RESULT_ERROR_INVALID_MEM_OBJECT; } } *phNativeMem = - reinterpret_cast(hMem->mem_.buffer_mem_.get()); + reinterpret_cast(hMem->Mem.BufferMem.get()); #elif defined(__HIP_PLATFORM_AMD__) *phNativeMem = - reinterpret_cast(hMem->mem_.buffer_mem_.get()); + reinterpret_cast(hMem->Mem.BufferMem.get()); #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif @@ -348,7 +345,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } - const bool performInitialCopy = + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); @@ -367,7 +364,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; // We only support RBGA channel order // TODO: check SYCL CTS and spec. 
May also have to support BGRA @@ -377,59 +374,59 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // We have to use hipArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives // a minimum value of 1, so we need to convert the answer. - HIP_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = pImageDesc->width; + HIP_ARRAY3D_DESCRIPTOR ArrayDesc; + ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = pImageDesc->width; if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = 0; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = pImageDesc->depth; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; } // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; + size_t PixelTypeSizeBytes; switch (pImageFormat->channelType) { case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; - 
pixel_type_size_bytes = 2; + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = HIP_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; + ArrayDesc.Format = HIP_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = HIP_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; + ArrayDesc.Format = HIP_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; break; default: // urMemImageCreate given unsupported image_channel_data_type @@ -437,43 +434,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( } // When a dimension isn't used image_desc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; + size_t PixelSizeBytes = + PixelTypeSizeBytes * 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; - ScopedContext active(hContext); - hipArray *image_array; - retErr = UR_CHECK_ERROR(hipArray3DCreate( - reinterpret_cast(&image_array), &array_desc)); + ScopedContext Active(hContext); + hipArray *ImageArray; + Result = UR_CHECK_ERROR(hipArray3DCreate( + 
reinterpret_cast(&ImageArray), &ArrayDesc)); try { - if (performInitialCopy) { + if (PerformInitialCopy) { // We have to use a different copy function for each image dimensionality if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - hipMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + Result = + UR_CHECK_ERROR(hipMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - hip_Memcpy2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = reinterpret_cast(image_array); - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - retErr = UR_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc)); + hip_Memcpy2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + CpyDesc.dstArray = reinterpret_cast(ImageArray); + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + Result = UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - HIP_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - cpy_desc.dstArray = reinterpret_cast(image_array); - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - cpy_desc.Depth = pImageDesc->depth; - retErr = UR_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc)); + HIP_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = 
hipMemoryType::hipMemoryTypeArray; + CpyDesc.dstArray = reinterpret_cast(ImageArray); + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + CpyDesc.Depth = pImageDesc->depth; + Result = UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc)); } } @@ -484,29 +481,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // handle. // HIP_RESOURCE_DESC::flags must be set to zero - hipResourceDesc image_res_desc; - image_res_desc.res.array.array = image_array; - image_res_desc.resType = hipResourceTypeArray; + hipResourceDesc ImageResDesc; + ImageResDesc.res.array.array = ImageArray; + ImageResDesc.resType = hipResourceTypeArray; - hipSurfaceObject_t surface; - retErr = UR_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc)); + hipSurfaceObject_t Surface; + Result = UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc)); - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, image_array, surface, flags, pImageDesc->type, pHost}); + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, ImageArray, Surface, flags, pImageDesc->type, pHost}); - if (urMemObj == nullptr) { + if (URMemObj == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = urMemObj.release(); - } catch (ur_result_t err) { - UR_CHECK_ERROR(hipFreeArray(image_array)); - return err; + *phMem = URMemObj.release(); + } catch (ur_result_t Err) { + UR_CHECK_ERROR(hipFreeArray(ImageArray)); + return Err; } catch (...) 
{ - UR_CHECK_ERROR(hipFreeArray(image_array)); + UR_CHECK_ERROR(hipFreeArray(ImageArray)); return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } /// \TODO Not implemented @@ -524,8 +521,7 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hMem->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - hMem->increment_reference_count(); + UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; } \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp index 9403fc565dfe8..95439609070e2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp @@ -20,182 +20,177 @@ struct ur_mem_handle_t_ { using ur_context = ur_context_handle_t_ *; using ur_mem = ur_mem_handle_t_ *; - // Context where the memory object is accessibles - ur_context context_; + // Context where the memory object is accessible + ur_context Context; /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; + std::atomic_uint32_t RefCount; + enum class Type { Buffer, Surface } MemType; // Original mem flags passed - ur_mem_flags_t memFlags_; + ur_mem_flags_t MemFlags; /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In HIP their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// as pointer-like structs, "Images" are stored in Textures or Surfaces. /// This union allows implementation to use either from the same handler. 
- union mem_ { + union MemImpl { // Handler for plain, pointer-based HIP allocations - struct buffer_mem_ { + struct BufferMem { using native_type = hipDeviceptr_t; // If this allocation is a sub-buffer (i.e., a view on an existing // allocation), this is the pointer to the parent handler structure - ur_mem parent_; + ur_mem Parent; // HIP handler for the pointer - native_type ptr_; + native_type Ptr; /// Pointer associated with this device on the host - void *hostPtr_; + void *HostPtr; /// Size of the allocation in bytes - size_t size_; + size_t Size; /// Offset of the active mapped region. - size_t mapOffset_; + size_t MapOffset; /// Pointer to the active mapped region, if any - void *mapPtr_; + void *MapPtr; /// Original flags for the mapped region - ur_map_flags_t mapFlags_; + ur_map_flags_t MapFlags; - /** alloc_mode - * classic: Just a normal buffer allocated on the device via hip malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host + /** AllocMode + * Classic: Just a normal buffer allocated on the device via hip malloc + * UseHostPtr: Use an address on the host for the device + * CopyIn: The data for the device comes from the host but the host pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation + * AllocHostPtr: Uses pinned-memory allocation */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; - - native_type get() const noexcept { return ptr_; } - - native_type get_with_offset(size_t offset) const noexcept { - return reinterpret_cast(reinterpret_cast(ptr_) + - offset); + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr + } MemAllocMode; + + native_type get() const noexcept { return Ptr; } + + native_type getWithOffset(size_t Offset) const noexcept { + return reinterpret_cast(reinterpret_cast(Ptr) + + Offset); } - void *get_void() const noexcept { return 
reinterpret_cast(ptr_); } + void *getVoid() const noexcept { return reinterpret_cast(Ptr); } - size_t get_size() const noexcept { return size_; } + size_t getSize() const noexcept { return Size; } - void *get_map_ptr() const noexcept { return mapPtr_; } + void *getMapPtr() const noexcept { return MapPtr; } - size_t get_map_offset(void *ptr) const noexcept { - (void)ptr; - return mapOffset_; + size_t getMapOffset(void *Ptr) const noexcept { + std::ignore = Ptr; + return MapOffset; } /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the HIP allocation. /// - void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; + void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; } else { // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); + MapPtr = static_cast(malloc(this->getSize())); } - return mapPtr_; + return MapPtr; } /// Detach the allocation from the host memory. - void unmap(void *ptr) noexcept { - (void)ptr; - assert(mapPtr_ != nullptr); + void unmap(void *Ptr) noexcept { + std::ignore = Ptr; + assert(MapPtr != nullptr); - if (mapPtr_ != hostPtr_) { - free(mapPtr_); + if (MapPtr != HostPtr) { + free(MapPtr); } - mapPtr_ = nullptr; - mapOffset_ = 0; + MapPtr = nullptr; + MapOffset = 0; } - ur_map_flags_t get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; } - } buffer_mem_; + } BufferMem; // Handler data for surface object (i.e. 
Images) - struct surface_mem_ { - hipArray *array_; - hipSurfaceObject_t surfObj_; - ur_mem_type_t imageType_; + struct SurfaceMem { + hipArray *Array; + hipSurfaceObject_t SurfObj; + ur_mem_type_t ImageType; - hipArray *get_array() const noexcept { return array_; } + hipArray *getArray() const noexcept { return Array; } - hipSurfaceObject_t get_surface() const noexcept { return surfObj_; } + hipSurfaceObject_t getSurface() const noexcept { return SurfObj; } - ur_mem_type_t get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; + ur_mem_type_t getImageType() const noexcept { return ImageType; } + } SurfaceMem; + } Mem; /// Constructs the UR MEM handler for a non-typed allocation ("buffer") - ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, - mem_::buffer_mem_::alloc_mode mode, hipDeviceptr_t ptr, - void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, - memFlags_{mem_flags} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - urMemRetain(mem_.buffer_mem_.parent_); + ur_mem_handle_t_(ur_context Ctxt, ur_mem Parent, ur_mem_flags_t MemFlags, + MemImpl::BufferMem::AllocMode Mode, hipDeviceptr_t Ptr, + void *HostPtr, size_t Size) + : Context{Ctxt}, RefCount{1}, MemType{Type::Buffer}, MemFlags{MemFlags} { + Mem.BufferMem.Ptr = Ptr; + Mem.BufferMem.Parent = Parent; + Mem.BufferMem.HostPtr = HostPtr; + Mem.BufferMem.Size = Size; + Mem.BufferMem.MapOffset = 0; + Mem.BufferMem.MapPtr = nullptr; + Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; + Mem.BufferMem.MemAllocMode = Mode; + if (isSubBuffer()) { + urMemRetain(Mem.BufferMem.Parent); } else { - urContextRetain(context_); + urContextRetain(Context); } }; /// 
Constructs the UR allocation for an Image object - ur_mem_handle_t_(ur_context ctxt, hipArray *array, hipSurfaceObject_t surf, - ur_mem_flags_t mem_flags, ur_mem_type_t image_type, - void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, - memFlags_{mem_flags} { - (void)host_ptr; - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.imageType_ = image_type; - mem_.surface_mem_.surfObj_ = surf; - urContextRetain(context_); + ur_mem_handle_t_(ur_context Ctxt, hipArray *Array, hipSurfaceObject_t Surf, + ur_mem_flags_t MemFlags, ur_mem_type_t ImageType, + void *HostPtr) + : Context{Ctxt}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags} { + std::ignore = HostPtr; + Mem.SurfaceMem.Array = Array; + Mem.SurfaceMem.ImageType = ImageType; + Mem.SurfaceMem.SurfObj = Surf; + urContextRetain(Context); } ~ur_mem_handle_t_() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - urMemRelease(mem_.buffer_mem_.parent_); - return; - } + if (isBuffer() && isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); + return; } - urContextRelease(context_); + urContextRelease(Context); } - // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + bool isBuffer() const noexcept { return MemType == Type::Buffer; } - bool is_sub_buffer() const noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + bool isSubBuffer() const noexcept { + return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); } - bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + bool isImage() const noexcept { return MemType == Type::Surface; } - ur_context get_context() const noexcept { return context_; } + ur_context getContext() const noexcept { return Context; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return 
--refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp index 8cd9bda305cb4..f9885446ef43a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -8,16 +8,16 @@ #include "platform.hpp" -hipEvent_t ur_platform_handle_t_::evBase_{nullptr}; +hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; -UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( - ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, - size_t Size, void *pPlatformInfo, size_t *pSizeRet) { +UR_DLLEXPORT ur_result_t UR_APICALL +urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, + size_t propSize, void *pPropValue, size_t *pSizeRet) { UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); + UrReturnHelper ReturnValue(propSize, pPropValue, pSizeRet); - switch (PlatformInfoType) { + switch (propName) { case UR_PLATFORM_INFO_NAME: return ReturnValue("AMD HIP BACKEND"); case UR_PLATFORM_INFO_VENDOR_NAME: @@ -48,76 +48,75 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
-/// UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { try { - static std::once_flag initFlag; - static uint32_t numPlatforms = 1; - static std::vector platformIds; + static std::once_flag InitFlag; + static uint32_t NumPlatforms = 1; + static std::vector PlatformIds; UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_VALUE); - ur_result_t err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; std::call_once( - initFlag, - [](ur_result_t &err) { + InitFlag, + [](ur_result_t &Err) { if (hipInit(0) != hipSuccess) { - numPlatforms = 0; + NumPlatforms = 0; return; } - int numDevices = 0; - err = UR_CHECK_ERROR(hipGetDeviceCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; + int NumDevices = 0; + Err = UR_CHECK_ERROR(hipGetDeviceCount(&NumDevices)); + if (NumDevices == 0) { + NumPlatforms = 0; return; } try { // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - - for (int i = 0; i < numDevices; ++i) { - hipDevice_t device; - err = UR_CHECK_ERROR(hipDeviceGet(&device, i)); - platformIds[i].devices_.emplace_back( - new ur_device_handle_t_{device, &platformIds[i]}); + NumPlatforms = NumDevices; + PlatformIds.resize(NumDevices); + + for (int i = 0; i < NumDevices; ++i) { + hipDevice_t Device; + Err = UR_CHECK_ERROR(hipDeviceGet(&Device, i)); + PlatformIds[i].Devices.emplace_back( + new ur_device_handle_t_{Device, &PlatformIds[i]}); } } catch (const std::bad_alloc &) { // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + PlatformIds[i].Devices.clear(); } - platformIds.clear(); - err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + PlatformIds.clear(); + Err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + PlatformIds[i].Devices.clear(); } - platformIds.clear(); + PlatformIds.clear(); throw; } }, - err); + Result); if (pNumPlatforms != nullptr) { - *pNumPlatforms = numPlatforms; + *pNumPlatforms = NumPlatforms; } if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { - phPlatforms[i] = &platformIds[i]; + for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { + phPlatforms[i] = &PlatformIds[i]; } } - return err; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -140,14 +139,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { return UR_RESULT_SUCCESS; } -// Returns plugin specific backend option. +// Get HIP plugin specific backend option. // Current support is only for optimization options. // Return empty string for cuda. // TODO: Determine correct string to be passed. UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( ur_platform_handle_t hPlatform, const char *pFrontendOption, const char **ppPlatformOption) { - (void)hPlatform; + std::ignore = hPlatform; using namespace std::literals; if (pFrontendOption == nullptr) return UR_RESULT_ERROR_INVALID_NULL_POINTER; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp index cf9c80c2365f5..86e24d952cc78 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.hpp @@ -18,6 +18,6 @@ /// when devices are used. 
/// struct ur_platform_handle_t_ { - static hipEvent_t evBase_; // HIP event used as base counter - std::vector> devices_; + static hipEvent_t EvBase; // HIP event used as base counter + std::vector> Devices; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 147ea15f32621..a35a9b0200321 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -8,59 +8,58 @@ #include "program.hpp" -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) - : module_{nullptr}, binary_{}, - binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { - urContextRetain(context_); +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{ + Ctxt} { + urContextRetain(Context); } -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } -ur_result_t ur_program_handle_t_::set_binary(const char *source, - size_t length) { +ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { // Do not re-set program binary data which has already been set as that will // delete the old binary data. 
- UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, UR_RESULT_ERROR_INVALID_OPERATION); - binary_ = source; - binarySizeInBytes_ = length; + Binary = Source; + BinarySizeInBytes = Length; return UR_RESULT_SUCCESS; } -ur_result_t ur_program_handle_t_::build_program(const char *build_options) { - if (build_options) { - this->buildOptions_ = build_options; +ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (BuildOptions) { + this->BuildOptions = BuildOptions; } - constexpr const unsigned int numberOfOptions = 4u; + constexpr const unsigned int NumberOfOptions = 4u; - hipJitOption options[numberOfOptions]; - void *optionVals[numberOfOptions]; + hipJitOption Options[NumberOfOptions]; + void *OptionVals[NumberOfOptions]; // Pass a buffer for info messages - options[0] = hipJitOptionInfoLogBuffer; - optionVals[0] = (void *)infoLog_; + Options[0] = hipJitOptionInfoLogBuffer; + OptionVals[0] = (void *)InfoLog; // Pass the size of the info buffer - options[1] = hipJitOptionInfoLogBufferSizeBytes; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; + Options[1] = hipJitOptionInfoLogBufferSizeBytes; + OptionVals[1] = (void *)(long)MAX_LOG_SIZE; // Pass a buffer for error message - options[2] = hipJitOptionErrorLogBuffer; - optionVals[2] = (void *)errorLog_; + Options[2] = hipJitOptionErrorLogBuffer; + OptionVals[2] = (void *)ErrorLog; // Pass the size of the error buffer - options[3] = hipJitOptionErrorLogBufferSizeBytes; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; + Options[3] = hipJitOptionErrorLogBufferSizeBytes; + OptionVals[3] = (void *)(long)MAX_LOG_SIZE; - auto result = UR_CHECK_ERROR( - hipModuleLoadDataEx(&module_, static_cast(binary_), - numberOfOptions, options, optionVals)); + auto Result = UR_CHECK_ERROR( + hipModuleLoadDataEx(&Module, static_cast(Binary), + NumberOfOptions, Options, OptionVals)); - const auto success = (result == UR_RESULT_SUCCESS); + const auto Success = 
(Result == UR_RESULT_SUCCESS); - buildStatus_ = - success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + BuildStatus = + Success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; // If no exception, result is correct - return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; + return Success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; } /// Finds kernel names by searching for entry points in the PTX source, as the @@ -75,14 +74,13 @@ ur_result_t getKernelNames(ur_program_handle_t) { /// HIP will handle the PTX/HIPBIN binaries internally through hipModule_t /// object. So, urProgramCreateWithIL and urProgramCreateWithBinary are /// equivalent in terms of HIP adapter. See \ref urProgramCreateWithBinary. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_device_handle_t hDevice = hContext->get_device(); + ur_device_handle_t hDevice = hContext->getDevice(); auto pBinary = reinterpret_cast(pIL); return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, @@ -91,7 +89,7 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, /// HIP will handle the PTX/HIPBIN binaries internally through a call to /// hipModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent -/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +/// in terms of HIP adapter. 
\TODO Implement asynchronous compilation /// UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, @@ -99,26 +97,25 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, return urProgramBuild(hContext, hProgram, pOptions); } -/// Loads the images from a UR program into a CUmodule that can be +/// Loads the images from a UR program into a hipModule_t that can be /// used later on to extract functions (kernels). /// See \ref ur_program_handle_t for implementation details. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - hProgram->build_program(pOptions); + hProgram->buildProgram(pOptions); - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -132,11 +129,11 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, /// TODO: Implement this. /// NOTE: The created UR object takes ownership of the native handle. /// -/// \param[in] nativeHandle The native handle to create UR program object from. -/// \param[in] context The UR context of the program. -/// \param[out] program Set to the UR program object created from native handle. +/// \param[in] hNativeProgram The native handle to create UR program object +/// from. \param[in] hContext The UR context of the program. \param[out] +/// phProgram Set to the UR program object created from native handle. 
/// -/// \return TBD +/// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, const ur_program_native_properties_t *pProperties, @@ -149,7 +146,7 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, ur_program_build_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { // Ignore unused parameter - (void)hDevice; + std::ignore = hDevice; UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -157,12 +154,12 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, switch (propName) { case UR_PROGRAM_BUILD_INFO_STATUS: { - return ReturnValue(hProgram->buildStatus_); + return ReturnValue(hProgram->BuildStatus); } case UR_PROGRAM_BUILD_INFO_OPTIONS: - return ReturnValue(hProgram->buildOptions_.c_str()); + return ReturnValue(hProgram->BuildOptions.c_str()); case UR_PROGRAM_BUILD_INFO_LOG: - return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + return ReturnValue(hProgram->InfoLog, hProgram->MAX_LOG_SIZE); default: break; } @@ -178,19 +175,19 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, switch (propName) { case UR_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(hProgram->get_reference_count()); + return ReturnValue(hProgram->getReferenceCount()); case UR_PROGRAM_INFO_CONTEXT: - return ReturnValue(hProgram->context_); + return ReturnValue(hProgram->Context); case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->context_->deviceId_, 1); + return ReturnValue(&hProgram->Context->DeviceId, 1); case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->binary_); + return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: - return ReturnValue(&hProgram->binarySizeInBytes_, 1); + return ReturnValue(&hProgram->BinarySizeInBytes, 1); case 
UR_PROGRAM_INFO_BINARIES: - return ReturnValue(&hProgram->binary_, 1); + return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); default: @@ -200,11 +197,10 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, } UR_APIEXPORT ur_result_t UR_APICALL -urProgramRetain(ur_program_handle_t program) { - UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(program->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_PROGRAM); - program->increment_reference_count(); +urProgramRetain(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); + hProgram->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -212,30 +208,30 @@ urProgramRetain(ur_program_handle_t program) { /// When the reference count reaches 0, it unloads the module from /// the context. UR_APIEXPORT ur_result_t UR_APICALL -urProgramRelease(ur_program_handle_t program) { - UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +urProgramRelease(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - UR_ASSERT(program->get_reference_count() != 0, + UR_ASSERT(hProgram->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_PROGRAM); // decrement ref count. If it is 0, delete the program. 
- if (program->decrement_reference_count() == 0) { + if (hProgram->decrementReferenceCount() == 0) { - std::unique_ptr program_ptr{program}; + std::unique_ptr ProgramPtr{hProgram}; - ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext active(program->get_context()); - auto hipModule = program->get(); - result = UR_CHECK_ERROR(hipModuleUnload(hipModule)); + ScopedContext Active(hProgram->getContext()); + auto HIPModule = hProgram->get(); + Result = UR_CHECK_ERROR(hipModuleUnload(HIPModule)); } catch (...) { - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -243,14 +239,15 @@ urProgramRelease(ur_program_handle_t program) { /// Gets the native HIP handle of a UR program object /// -/// \param[in] program The UR program to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the UR program object. +/// \param[in] hProgram The UR program to get the native HIP object of. +/// \param[out] phNativeProgram Set to the native handle of the UR program +/// object. /// -/// \return TBD +/// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( - ur_program_handle_t program, ur_native_handle_t *nativeHandle) { - UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - *nativeHandle = reinterpret_cast(program->get()); + ur_program_handle_t hProgram, ur_native_handle_t *phNativeProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + *phNativeProgram = reinterpret_cast(hProgram->get()); return UR_RESULT_SUCCESS; } @@ -259,7 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( /// for later. 
/// /// Note: Only supports one device -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, const uint8_t *pBinary, const ur_program_properties_t *pProperties, @@ -268,12 +264,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); - UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr retProgram{ + std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext}}; // TODO: Set metadata here and use reqd_work_group_size information. @@ -286,16 +282,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); - retError = retProgram->set_binary(pBinary_string, size); - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + Result = RetProgram->setBinary(pBinary_string, size); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - *phProgram = retProgram.release(); + *phProgram = RetProgram.release(); - return retError; + return Result; } // This entry point is only used for native specialization constants (SPIR-V), -// and the CUDA plugin is AOT only so this entry point is not supported. +// and the HIP plugin is AOT only so this entry point is not supported. 
UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -307,21 +303,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( // Check if device passed is the same the device bound to the context UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - hipFunction_t func; - hipError_t ret = hipModuleGetFunction(&func, hProgram->get(), pFunctionName); - *ppFunctionPointer = func; - ur_result_t retError = UR_RESULT_SUCCESS; + hipFunction_t Func; + hipError_t Ret = hipModuleGetFunction(&Func, hProgram->get(), pFunctionName); + *ppFunctionPointer = Func; + ur_result_t Result = UR_RESULT_SUCCESS; - if (ret != hipSuccess && ret != hipErrorNotFound) - retError = UR_CHECK_ERROR(ret); - if (ret == hipErrorNotFound) { + if (Ret != hipSuccess && Ret != hipErrorNotFound) + Result = UR_CHECK_ERROR(Ret); + if (Ret == hipErrorNotFound) { *ppFunctionPointer = 0; - retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; } - return retError; + return Result; } \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp index 9e144798ad0d6..b895c206479d5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp @@ -17,31 +17,31 @@ /// struct ur_program_handle_t_ { using native_type = hipModule_t; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - ur_context_handle_t context_; + 
native_type Module; + const char *Binary; + size_t BinarySizeInBytes; + std::atomic_uint32_t RefCount; + ur_context_handle_t Context; constexpr static size_t MAX_LOG_SIZE = 8192u; - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + char ErrorLog[MAX_LOG_SIZE], InfoLog[MAX_LOG_SIZE]; + std::string BuildOptions; + ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - ur_program_handle_t_(ur_context_handle_t ctxt); + ur_program_handle_t_(ur_context_handle_t Ctxt); ~ur_program_handle_t_(); - ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); - ur_result_t build_program(const char *build_options); - ur_context_handle_t get_context() const { return context_; }; + ur_result_t buildProgram(const char *BuildOptions); + ur_context_handle_t getContext() const { return Context; }; - native_type get() const noexcept { return module_; }; + native_type get() const noexcept { return Module; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index fb1305e155b19..3c0422afac2ce 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -10,105 +10,101 @@ #include "context.hpp" #include "event.hpp" -void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( - hipStream_t stream, uint32_t stream_i) { - 
if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded( + hipStream_t Stream, uint32_t Stream_i) { + if (BarrierEvent && !ComputeAppliedBarrier[Stream_i]) { + UR_CHECK_ERROR(hipStreamWaitEvent(Stream, BarrierEvent, 0)); + ComputeAppliedBarrier[Stream_i] = true; } } -void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( - hipStream_t stream, uint32_t stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0)); - transfer_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( + hipStream_t Stream, uint32_t Stream_i) { + if (BarrierEvent && !TransferAppliedBarrier[Stream_i]) { + UR_CHECK_ERROR(hipStreamWaitEvent(Stream, BarrierEvent, 0)); + TransferAppliedBarrier[Stream_i] = true; } } -hipStream_t -ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { - uint32_t stream_i; - uint32_t token; +hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + uint32_t Stream_i; + uint32_t Token; while (true) { - if (num_compute_streams_ < compute_streams_.size()) { + if (NumComputeStreams < ComputeStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); + std::lock_guard guard(ComputeStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { + // change NumComputeStreams after that + if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR(hipStreamCreateWithFlags( - &compute_streams_[num_compute_streams_++], flags_)); + &ComputeStreams[NumComputeStreams++], Flags)); } } - token = compute_stream_idx_++; - stream_i 
= token % compute_streams_.size(); + Token = ComputeStreamIdx++; + Stream_i = Token % ComputeStreams.size(); // if a stream has been reused before it was next selected round-robin // fashion, we want to delay its next use and instead select another one // that is more likely to have completed all the enqueued work. - if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; + if (DelayCompute[Stream_i]) { + DelayCompute[Stream_i] = false; } else { break; } } - if (stream_token) { - *stream_token = token; + if (StreamToken) { + *StreamToken = Token; } - hipStream_t res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + hipStream_t Res = ComputeStreams[Stream_i]; + computeStreamWaitForBarrierIfNeeded(Res, Stream_i); + return Res; } -hipStream_t ur_queue_handle_t_::get_next_compute_stream( - uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, - ur_stream_quard &guard, uint32_t *stream_token) { - for (uint32_t i = 0; i < num_events_in_wait_list; i++) { - uint32_t token = event_wait_list[i]->get_compute_stream_token(); - if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); +hipStream_t ur_queue_handle_t_::getNextComputeStream( + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_stream_quard &Guard, uint32_t *StreamToken) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + uint32_t Token = EventWaitList[i]->getComputeStreamToken(); + if (EventWaitList[i]->getQueue() == this && canReuseStream(Token)) { + std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - uint32_t stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; + // LastSyncComputeStreams + if 
(canReuseStream(Token)) { + uint32_t Stream_i = Token % DelayCompute.size(); + DelayCompute[Stream_i] = true; + if (StreamToken) { + *StreamToken = Token; } - guard = ur_stream_quard{std::move(compute_sync_guard)}; - hipStream_t res = event_wait_list[i]->get_stream(); - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + Guard = ur_stream_quard{std::move(ComputeSyncGuard)}; + hipStream_t Res = EventWaitList[i]->getStream(); + computeStreamWaitForBarrierIfNeeded(Res, Stream_i); + return Res; } } } - guard = {}; - return get_next_compute_stream(stream_token); + Guard = {}; + return getNextComputeStream(StreamToken); } -hipStream_t ur_queue_handle_t_::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); +hipStream_t ur_queue_handle_t_::getNextTransferStream() { + if (TransferStreams.empty()) { // for example in in-order queue + return getNextComputeStream(); } - if (num_transfer_streams_ < transfer_streams_.size()) { + if (NumTransferStreams < TransferStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); + std::lock_guard Guard(TransferStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { + // change NumTransferStreams after that + if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR(hipStreamCreateWithFlags( - &transfer_streams_[num_transfer_streams_++], flags_)); + &TransferStreams[NumTransferStreams++], Flags)); } } - uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - hipStream_t res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + uint32_t Stream_i = TransferStreamIdx++ % TransferStreams.size(); + hipStream_t Res = TransferStreams[Stream_i]; + 
transferStreamWaitForBarrierIfNeeded(Res, Stream_i); + return Res; } -/////////////////////////////// - UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { @@ -117,36 +113,33 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); try { - std::unique_ptr queueImpl{nullptr}; + std::unique_ptr QueueImpl{nullptr}; - if (hContext->get_device() != hDevice) { + if (hContext->getDevice() != hDevice) { *phQueue = nullptr; return UR_RESULT_ERROR_INVALID_DEVICE; } - unsigned int flags = 0; + unsigned int Flags = 0; - const bool is_out_of_order = + const bool IsOutOfOrder = pProps->flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - std::vector computeHipStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); - std::vector transferHipStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_transfer_streams : 0); + std::vector ComputeHipStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumComputeStreams : 1); + std::vector TransferHipStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); - queueImpl = std::unique_ptr(new ur_queue_handle_t_{ - std::move(computeHipStreams), std::move(transferHipStreams), hContext, - hDevice, flags, pProps->flags}); + QueueImpl = std::unique_ptr(new ur_queue_handle_t_{ + std::move(ComputeHipStreams), std::move(TransferHipStreams), hContext, + hDevice, Flags, pProps->flags}); - *phQueue = queueImpl.release(); + *phQueue = QueueImpl.release(); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - - return err; - + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ - return UR_RESULT_ERROR_OUT_OF_RESOURCES; } } @@ -161,21 +154,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); switch (propName) { case UR_QUEUE_INFO_CONTEXT: - return ReturnValue(hQueue->context_); + return ReturnValue(hQueue->Context); case UR_QUEUE_INFO_DEVICE: - return ReturnValue(hQueue->device_); + return ReturnValue(hQueue->Device); case UR_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(hQueue->get_reference_count()); + return ReturnValue(hQueue->getReferenceCount()); case UR_QUEUE_INFO_FLAGS: - return ReturnValue(hQueue->ur_flags_); + return ReturnValue(hQueue->URFlags); case UR_QUEUE_INFO_EMPTY: { - bool IsReady = hQueue->all_of([](hipStream_t s) -> bool { - const hipError_t ret = hipStreamQuery(s); - if (ret == hipSuccess) + bool IsReady = hQueue->allOf([](hipStream_t S) -> bool { + const hipError_t Ret = hipStreamQuery(S); + if (Ret == hipSuccess) return true; try { - UR_CHECK_ERROR(ret); + UR_CHECK_ERROR(Ret); } catch (...) 
{ return false; } @@ -192,32 +185,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hQueue->get_reference_count() > 0, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(hQueue->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_QUEUE); - hQueue->increment_reference_count(); + hQueue->incrementReferenceCount(); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hQueue->decrement_reference_count() > 0) { + if (hQueue->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } try { - std::unique_ptr queueImpl(hQueue); + std::unique_ptr QueueImpl(hQueue); - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - hQueue->for_each_stream([](hipStream_t s) { - UR_CHECK_ERROR(hipStreamSynchronize(s)); - UR_CHECK_ERROR(hipStreamDestroy(s)); + hQueue->forEachStream([](hipStream_t S) { + UR_CHECK_ERROR(hipStreamSynchronize(S)); + UR_CHECK_ERROR(hipStreamDestroy(S)); }); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -227,26 +220,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // set default result to a negative result (avoid false-positve tests) - ur_result_t result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + ur_result_t Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - hQueue->sync_streams([&result](hipStream_t s) { - result = UR_CHECK_ERROR(hipStreamSynchronize(s)); + hQueue->syncStreams([&Result](hipStream_t S) { + Result = UR_CHECK_ERROR(hipStreamSynchronize(S)); }); - } catch (ur_result_t err) { - - result = err; - + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } // There is no HIP counterpart for queue flushing and we don't run into the @@ -259,8 +249,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { /// Gets the native HIP handle of a UR queue object /// -/// \param[in] queue The UR queue to get the native HIP object of. -/// \param[out] nativeHandle Set to the native handle of the UR queue object. +/// \param[in] hQueue The UR queue to get the native HIP object of. +/// \param[out] phNativeQueue Set to the native handle of the UR queue object. 
/// /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( @@ -268,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); *phNativeQueue = - reinterpret_cast(hQueue->get_next_compute_stream()); + reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; } @@ -278,14 +268,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( /// TODO: Implement this. /// NOTE: The created UR object takes ownership of the native handle. /// -/// \param[in] nativeHandle The native handle to create UR queue object from. -/// \param[in] context is the UR context of the queue. -/// \param[out] queue Set to the UR queue object created from native handle. -/// \param ownNativeHandle tells if SYCL RT should assume the ownership of +/// \param[in] hNativeQueue The native handle to create UR queue object from. +/// \param[in] hContext is the UR context of the queue. +/// \param[out] phQueue Set to the UR queue object created from native handle. +/// \param pProperties->isNativeHandleOwned tells if SYCL RT should assume the +/// ownership of /// the native handle, if it can. 
/// -/// -/// \return TBD +/// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp index f391f1cc82a7c..2378f7b3d8315 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp @@ -15,100 +15,100 @@ using ur_stream_quard = std::unique_lock; /// struct ur_queue_handle_t_ { using native_type = hipStream_t; - static constexpr int default_num_compute_streams = 64; - static constexpr int default_num_transfer_streams = 16; + static constexpr int DefaultNumComputeStreams = 64; + static constexpr int DefaultNumTransferStreams = 16; - std::vector compute_streams_; - std::vector transfer_streams_; - // delay_compute_ keeps track of which streams have been recently reused and + std::vector ComputeStreams; + std::vector TransferStreams; + // DelayCompute keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. When // skipped, its delay flag is cleared. 
- std::vector delay_compute_; + std::vector DelayCompute; // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - ur_context_handle_t context_; - ur_device_handle_t device_; - hipEvent_t barrier_event_ = nullptr; - hipEvent_t barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - ur_queue_flags_t ur_flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first + std::vector ComputeAppliedBarrier; + std::vector TransferAppliedBarrier; + ur_context_handle_t Context; + ur_device_handle_t Device; + hipEvent_t BarrierEvent = nullptr; + hipEvent_t BarrierTmpEvent = nullptr; + std::atomic_uint32_t RefCount; + std::atomic_uint32_t EventCount; + std::atomic_uint32_t ComputeStreamIdx; + std::atomic_uint32_t TransferStreamIdx; + unsigned int NumComputeStreams; + unsigned int NumTransferStreams; + unsigned int LastSyncComputeStreams; + unsigned int LastSyncTransferStreams; + unsigned int Flags; + ur_queue_flags_t URFlags; + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - - ur_queue_handle_t_(std::vector &&compute_streams, - std::vector &&transfer_streams, - ur_context_handle_t context, ur_device_handle_t device, - unsigned int flags, ur_queue_flags_t ur_flags) - : compute_streams_{std::move(compute_streams)}, - 
transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, - transfer_stream_idx_{0}, num_compute_streams_{0}, - num_transfer_streams_{0}, last_sync_compute_streams_{0}, - last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags) { - urContextRetain(context_); - urDeviceRetain(device_); + std::mutex ComputeStreamSyncMutex; + std::mutex ComputeStreamMutex; + std::mutex TransferStreamMutex; + std::mutex BarrierMutex; + + ur_queue_handle_t_(std::vector &&ComputeStreams, + std::vector &&TransferStreams, + ur_context_handle_t Context, ur_device_handle_t Device, + unsigned int Flags, ur_queue_flags_t URFlags) + : ComputeStreams{std::move(ComputeStreams)}, TransferStreams{std::move( + TransferStreams)}, + DelayCompute(this->ComputeStreams.size(), false), + ComputeAppliedBarrier(this->ComputeStreams.size()), + TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, + Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIdx{0}, + TransferStreamIdx{0}, NumComputeStreams{0}, NumTransferStreams{0}, + LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), + URFlags(URFlags) { + urContextRetain(Context); + urDeviceRetain(Device); } ~ur_queue_handle_t_() { - urContextRelease(context_); - urDeviceRelease(device_); + urContextRelease(Context); + urDeviceRelease(Device); } - void compute_stream_wait_for_barrier_if_needed(hipStream_t stream, - uint32_t stream_i); - void transfer_stream_wait_for_barrier_if_needed(hipStream_t stream, - uint32_t stream_i); + void computeStreamWaitForBarrierIfNeeded(hipStream_t Stream, + uint32_t Stream_i); + void transferStreamWaitForBarrierIfNeeded(hipStream_t Stream, + uint32_t Stream_i); - // get_next_compute/transfer_stream() functions return streams from + // 
getNextCompute/TransferStream() functions return streams from // appropriate pools in round-robin fashion - native_type get_next_compute_stream(uint32_t *stream_token = nullptr); - // this overload tries select a stream that was used by one of dependancies. + native_type getNextComputeStream(uint32_t *StreamToken = nullptr); + // this overload tries select a stream that was used by one of dependencies. // If that is not possible returns a new stream. If a stream is reused it // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list, - ur_stream_quard &guard, - uint32_t *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(uint32_t stream_token) { + native_type getNextComputeStream(uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_stream_quard &Guard, + uint32_t *StreamToken = nullptr); + native_type getNextTransferStream(); + native_type get() { return getNextComputeStream(); }; + + bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } - return last_sync_compute_streams_ > stream_token; + return LastSyncComputeStreams > StreamToken; } - bool can_reuse_stream(uint32_t stream_token) { + bool canReuseStream(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } // If the command represented by the stream token was not the last command // enqueued to the stream we can not reuse the stream - we need to allow for // commands enqueued after it and the one we are about to enqueue to run // 
concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); + bool IsLastCommand = + (ComputeStreamIdx - StreamToken) <= ComputeStreams.size(); // If there was a barrier enqueued to the queue after the command // represented by the stream token we should not reuse the stream, as we can // not take that stream into account for the bookkeeping for the next @@ -117,127 +117,122 @@ struct ur_queue_handle_t_ { // represented by the stream token is guaranteed to be complete by the // barrier before any work we are about to enqueue to the stream will start, // so the event does not need to be synchronized with. - return is_last_command && !has_been_synchronized(stream_token); + return IsLastCommand && !hasBeenSynchronized(StreamToken); } - template bool all_of(T &&f) { + template bool allOf(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) return false; } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, + F)) return false; } return true; } - template void for_each_stream(T &&f) { + template void forEachStream(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - 
std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + for (unsigned int i = 0; i < End; i++) { + F(ComputeStreams[i]); } } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + for (unsigned int i = 0; i < End; i++) { + F(TransferStreams[i]); } } } - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; + template void syncStreams(T &&F) { + auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( + unsigned int Start, unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); + Delay[i] = false; } }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); + auto SyncTransfer = [&F, &Streams = TransferStreams](unsigned int Start, + unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); } }; { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard(compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? 
num_compute_streams_ - : compute_stream_idx_.load(); - if (end - start >= size) { - sync_compute(0, size); + unsigned int Size = static_cast(ComputeStreams.size()); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int Start = LastSyncComputeStreams; + unsigned int End = NumComputeStreams < Size ? NumComputeStreams + : ComputeStreamIdx.load(); + if (End - Start >= Size) { + SyncCompute(0, Size); } else { - start %= size; - end %= size; - if (start < end) { - sync_compute(start, end); + Start %= Size; + End %= Size; + if (Start < End) { + SyncCompute(Start, End); } else { - sync_compute(start, size); - sync_compute(0, end); + SyncCompute(Start, Size); + SyncCompute(0, End); } } if (ResetUsed) { - last_sync_compute_streams_ = end; + LastSyncComputeStreams = End; } } { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? num_transfer_streams_ - : transfer_stream_idx_.load(); - if (end - start >= size) { - sync_transfer(0, size); + unsigned int Size = static_cast(TransferStreams.size()); + if (!Size) { + return; + } + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size ? 
NumTransferStreams + : TransferStreamIdx.load(); + if (End - Start >= Size) { + SyncTransfer(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start < End) { + SyncTransfer(Start, End); } else { - start %= size; - end %= size; - if (start < end) { - sync_transfer(start, end); - } else { - sync_transfer(start, size); - sync_transfer(0, end); - } - } - if (ResetUsed) { - last_sync_transfer_streams_ = end; + SyncTransfer(Start, Size); + SyncTransfer(0, End); } } + if (ResetUsed) { + LastSyncTransferStreams = End; + } } } - ur_context_handle_t get_context() const { return context_; }; + ur_context_handle_t getContext() const { return Context; }; - ur_device_handle_t get_device() const { return device_; }; + ur_device_handle_t getDevice() const { return Device; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - uint32_t get_next_event_id() noexcept { return ++eventCount_; } + uint32_t getNextEventId() noexcept { return ++EventCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp index 151400c4a6128..9cc1520d13ade 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp @@ -12,20 +12,20 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, ur_sampler_handle_t *phSampler) { - std::unique_ptr retImplSampl{ + std::unique_ptr RetImplSampl{ new ur_sampler_handle_t_(hContext)}; if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - retImplSampl->props_ |= pDesc->normalizedCoords; - 
retImplSampl->props_ |= (pDesc->filterMode << 1); - retImplSampl->props_ |= (pDesc->addressingMode << 2); + RetImplSampl->Props |= pDesc->normalizedCoords; + RetImplSampl->Props |= pDesc->filterMode << 1; + RetImplSampl->Props |= pDesc->addressingMode << 2; } else { // Set default values - retImplSampl->props_ |= true; // Normalized Coords - retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + RetImplSampl->Props |= true; // Normalized Coords + RetImplSampl->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; } - *phSampler = retImplSampl.release(); + *phSampler = RetImplSampl.release(); return UR_RESULT_SUCCESS; } @@ -37,22 +37,22 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, switch (propName) { case UR_SAMPLER_INFO_REFERENCE_COUNT: - return ReturnValue(hSampler->get_reference_count()); + return ReturnValue(hSampler->getReferenceCount()); case UR_SAMPLER_INFO_CONTEXT: - return ReturnValue(hSampler->context_); + return ReturnValue(hSampler->Context); case UR_SAMPLER_INFO_NORMALIZED_COORDS: { - bool norm_coords_prop = static_cast(hSampler->props_); - return ReturnValue(norm_coords_prop); + bool NormCoordsProp = static_cast(hSampler->Props); + return ReturnValue(NormCoordsProp); } case UR_SAMPLER_INFO_FILTER_MODE: { - auto filter_prop = - static_cast(((hSampler->props_ >> 1) & 0x1)); - return ReturnValue(filter_prop); + auto FilterProp = + static_cast((hSampler->Props >> 1) & 0x1); + return ReturnValue(FilterProp); } case UR_SAMPLER_INFO_ADDRESSING_MODE: { - auto addressing_prop = - static_cast(hSampler->props_ >> 2); - return ReturnValue(addressing_prop); + auto AddressingProp = + static_cast(hSampler->Props >> 2); + return ReturnValue(AddressingProp); } default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; @@ -62,7 +62,7 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - hSampler->increment_reference_count(); 
+ hSampler->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -72,11 +72,11 @@ ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hSampler->get_reference_count() != 0, + hSampler->getReferenceCount() != 0, "Reference count overflow detected in urSamplerRelease."); // decrement ref count. If it is 0, delete the sampler. - if (hSampler->decrement_reference_count() == 0) { + if (hSampler->decrementReferenceCount() == 0) { delete hSampler; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp index 6b60092292ed2..b1c98f0171741 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.hpp @@ -16,16 +16,16 @@ /// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | struct ur_sampler_handle_t_ { - std::atomic_uint32_t refCount_; - uint32_t props_; - ur_context_handle_t context_; + std::atomic_uint32_t RefCount; + uint32_t Props; + ur_context_handle_t Context; - ur_sampler_handle_t_(ur_context_handle_t context) - : refCount_(1), props_(0), context_(context) {} + ur_sampler_handle_t_(ur_context_handle_t Context) + : RefCount(1), Props(0), Context(Context) {} - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 
f0eb6008d8a36..c7258ad241373 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -14,11 +14,11 @@ namespace { // TODO - this is a duplicate of what is in the L0 plugin // We should move this to somewhere common ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { - if (nullptr == pDdiTable) { + if (pDdiTable == nullptr) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - // Pre 1.0 we enforce loader and adapter must have same version. - // Post 1.0 only major version match should be required. + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. if (version != UR_API_VERSION_CURRENT) { return UR_RESULT_ERROR_UNSUPPORTED_VERSION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index 66985fa46988e..2ef88db1b5caa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -14,43 +14,41 @@ #include "platform.hpp" /// USM: Implements USM Host allocations using HIP Pinned Memory -/// UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; - UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= 
DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(hipHostMalloc(ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(hipHostMalloc(ppMem, size)); + } catch (ur_result_t Error) { + Result = Error; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM device allocations using a normal HIP device pointer -/// UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, @@ -59,35 +57,34 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(hipMalloc(ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(hipMalloc(ppMem, size)); + } 
catch (ur_result_t Error) { + Result = Error; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM Shared allocations using HIP Managed Memory -/// UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, @@ -96,59 +93,58 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(hipMallocManaged(ppMem, size, hipMemAttachGlobal)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(hipMallocManaged(ppMem, size, hipMemAttachGlobal)); + } catch (ur_result_t Error) { + Result = Error; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % 
pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Frees the given USM pointer associated with the context. -/// UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - unsigned int type; + ScopedContext Active(hContext); + unsigned int Type; hipPointerAttribute_t hipPointerAttributeType; - result = + Result = UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem)); - type = hipPointerAttributeType.memoryType; - UR_ASSERT(type == hipMemoryTypeDevice || type == hipMemoryTypeHost, + Type = hipPointerAttributeType.memoryType; + UR_ASSERT(Type == hipMemoryTypeDevice || Type == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (type == hipMemoryTypeDevice) { - result = UR_CHECK_ERROR(hipFree(pMem)); + if (Type == hipMemoryTypeDevice) { + Result = UR_CHECK_ERROR(hipFree(pMem)); } - if (type == hipMemoryTypeHost) { - result = UR_CHECK_ERROR(hipFreeHost(pMem)); + if (Type == hipMemoryTypeHost) { + Result = UR_CHECK_ERROR(hipFreeHost(pMem)); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Error) { + Result = Error; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -158,38 +154,38 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; hipPointerAttribute_t hipPointerAttributeType; UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { - ScopedContext active(hContext); + ScopedContext Active(hContext); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { - unsigned int value; + 
unsigned int Value; // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue - hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, pMem); - if (ret == hipErrorInvalidValue) { + hipError_t Ret = hipPointerGetAttributes(&hipPointerAttributeType, pMem); + if (Ret == hipErrorInvalidValue) { // pointer not known to the HIP subsystem return ReturnValue(UR_USM_TYPE_UNKNOWN); } - result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); - value = hipPointerAttributeType.isManaged; - if (value) { + Result = checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); + Value = hipPointerAttributeType.isManaged; + if (Value) { // pointer to managed memory return ReturnValue(UR_USM_TYPE_SHARED); } - result = UR_CHECK_ERROR( + Result = UR_CHECK_ERROR( hipPointerGetAttributes(&hipPointerAttributeType, pMem)); - value = hipPointerAttributeType.memoryType; - UR_ASSERT(value == hipMemoryTypeDevice || value == hipMemoryTypeHost, + Value = hipPointerAttributeType.memoryType; + UR_ASSERT(Value == hipMemoryTypeDevice || Value == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (value == hipMemoryTypeDevice) { + if (Value == hipMemoryTypeDevice) { // pointer to device memory return ReturnValue(UR_USM_TYPE_DEVICE); } - if (value == hipMemoryTypeHost) { + if (Value == hipMemoryTypeHost) { // pointer to host memory return ReturnValue(UR_USM_TYPE_HOST); } @@ -206,26 +202,26 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return UR_RESULT_ERROR_INVALID_VALUE; case UR_USM_ALLOC_INFO_DEVICE: { // get device index associated with this pointer - result = UR_CHECK_ERROR( + Result = UR_CHECK_ERROR( hipPointerGetAttributes(&hipPointerAttributeType, pMem)); - int device_idx = hipPointerAttributeType.device; + int DeviceIdx = hipPointerAttributeType.device; // currently each device is in its own platform, so find the platform at // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = 
urPlatformGet(device_idx + 1, platforms.data(), nullptr); + std::vector Platforms; + Platforms.resize(DeviceIdx + 1); + Result = urPlatformGet(DeviceIdx + 1, Platforms.data(), nullptr); // get the device from the platform - ur_device_handle_t device = platforms[device_idx]->devices_[0].get(); - return ReturnValue(device); + ur_device_handle_t Device = Platforms[DeviceIdx]->Devices[0].get(); + return ReturnValue(Device); } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Error) { + Result = Error; } - return result; + return Result; } From 3a9742434b48e8c31cd456dff4f5f1b8224fbb46 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 14 Jun 2023 11:12:14 +0100 Subject: [PATCH 21/42] [SYCL][UR][HIP]Add PCI_ADDRESS device info --- .../unified_runtime/ur/adapters/hip/device.cpp | 16 +++++++++++++++- .../unified_runtime/ur/adapters/hip/queue.cpp | 5 +++-- .../unified_runtime/ur/adapters/hip/queue.hpp | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 3e4aab8f1f0aa..3730081d76313 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -825,9 +825,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_IMAGE_SRGB: return ReturnValue(false); + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + sycl::detail::ur::assertion( + hipDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, + hDevice->get()) == hipSuccess); + // A typical PCI address is 12 bytes + \0: "1234:67:90.2", but the HIP API + // is not guaranteed to use this format. In practice, it uses this format, + // at least in 5.3-5.5. 
To be on the safe side, we make sure the terminating + // \0 is set. + AddressBuffer[AddressBufferSize - 1] = '\0'; + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) > 0); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } // TODO: Investigate if this information is available on HIP. - case UR_DEVICE_INFO_PCI_ADDRESS: case UR_DEVICE_INFO_GPU_EU_COUNT: case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: case UR_DEVICE_INFO_GPU_EU_SLICES: diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index 3c0422afac2ce..55d662cbf75be 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -253,8 +253,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { /// \param[out] phNativeQueue Set to the native handle of the UR queue object. /// /// \return UR_RESULT_SUCCESS -UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( - ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { +UR_APIEXPORT ur_result_t UR_APICALL +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp index 2378f7b3d8315..d8a5e7d81bee5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp @@ -176,7 +176,7 @@ struct ur_queue_handle_t_ { }; { unsigned int Size = static_cast(ComputeStreams.size()); - std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); std::lock_guard ComputeGuard(ComputeStreamMutex); 
unsigned int Start = LastSyncComputeStreams; unsigned int End = NumComputeStreams < Size ? NumComputeStreams From ed5ca282030050d9055d11613c72f306d80b8191 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 14 Jun 2023 11:28:49 +0100 Subject: [PATCH 22/42] [SYCL][UR][HIP]fix some build warnings --- sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp | 7 +++++++ sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp | 1 + 3 files changed, 9 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp index 16f162ff35031..6710ec4e1e206 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -17,6 +17,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) { + std::ignore = DeviceCount; std::ignore = pProperties; UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index a35a9b0200321..9aa4805332778 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -103,6 +103,8 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { + std::ignore = hContext; + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); ur_result_t Result = UR_RESULT_SUCCESS; @@ -138,6 +140,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, const ur_program_native_properties_t 
*pProperties, ur_program_handle_t *phProgram) { + std::ignore = hNativeProgram; + std::ignore = hContext; + std::ignore = pProperties; + std::ignore = phProgram; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index 55d662cbf75be..5040423b6d526 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -256,6 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { + std::ignore = pDesc; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); From 52a0c10e08c2e01c8920a1bab3a51d12cae4cfdd Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 14 Jun 2023 11:38:52 +0100 Subject: [PATCH 23/42] [SYCL][UR][HIP]Change UR_DLLEXPORT to UR_APIEXPORT and version ur_hip --- sycl/plugins/unified_runtime/CMakeLists.txt | 5 +++++ .../plugins/unified_runtime/ur/adapters/hip/device.cpp | 2 +- .../unified_runtime/ur/adapters/hip/platform.cpp | 10 +++++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index f93801906f7ab..336b871cd960e 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -207,6 +207,11 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) Threads::Threads ) + set_target_properties("ur_adapter_hip" PROPERTIES + VERSION "0.0.0" + SOVERSION "0" + ) + if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD") target_link_libraries(ur_adapter_hip PUBLIC rocmdrv) # Set HIP define to select AMD platform diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 3730081d76313..154b95022fc2c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -874,7 +874,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since HIP devices are always root /// devices. -UR_DLLEXPORT ur_result_t UR_APICALL +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t hDevice) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp index f9885446ef43a..2f49a6d7c3a04 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -10,7 +10,7 @@ hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; -UR_DLLEXPORT ur_result_t UR_APICALL +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, size_t propSize, void *pPropValue, size_t *pSizeRet) { @@ -48,7 +48,7 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, /// /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
-UR_DLLEXPORT ur_result_t UR_APICALL +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -122,7 +122,7 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, } } -UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -131,11 +131,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( return UR_RESULT_SUCCESS; } -UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { +UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { return UR_RESULT_SUCCESS; } -UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { +UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) { return UR_RESULT_SUCCESS; } From 43e8ed4cf17348aec13a95528968de4842c9f3ea Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 14 Jun 2023 16:00:59 +0100 Subject: [PATCH 24/42] [SYCL][UR][HIP] Small fixes for unused parameters and formating --- .../unified_runtime/ur/adapters/hip/event.cpp | 8 +++---- .../ur/adapters/hip/program.cpp | 4 ++-- .../unified_runtime/ur/adapters/hip/queue.hpp | 4 ++-- .../unified_runtime/ur/adapters/hip/usm.cpp | 22 +++++++++---------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index c75a4cf52db7c..56e379071e40c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -15,10 +15,10 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, ur_queue_handle_t Queue, hipStream_t Stream, uint32_t StreamToken) - : CommandType{Type}, RefCount{1}, 
HasBeenWaitedOn{false}, - IsRecorded{false}, IsStarted{false}, - StreamToken{StreamToken}, EventEnd{nullptr}, EvStart{nullptr}, - EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { + : CommandType{Type}, RefCount{1}, HasBeenWaitedOn{false}, IsRecorded{false}, + IsStarted{false}, StreamToken{StreamToken}, EventEnd{nullptr}, + EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, + Context{Context} { bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 9aa4805332778..144b9bc2cef77 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -9,8 +9,8 @@ #include "program.hpp" ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt) - : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{ - Ctxt} { + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Ctxt} { urContextRetain(Context); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp index d8a5e7d81bee5..ac8aeaf37c373 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.hpp @@ -54,8 +54,8 @@ struct ur_queue_handle_t_ { std::vector &&TransferStreams, ur_context_handle_t Context, ur_device_handle_t Device, unsigned int Flags, ur_queue_flags_t URFlags) - : ComputeStreams{std::move(ComputeStreams)}, TransferStreams{std::move( - TransferStreams)}, + : ComputeStreams{std::move(ComputeStreams)}, + TransferStreams{std::move(TransferStreams)}, DelayCompute(this->ComputeStreams.size(), false), ComputeAppliedBarrier(this->ComputeStreams.size()), TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, diff --git 
a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index 2ef88db1b5caa..8849dd59d23b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -14,9 +14,9 @@ #include "platform.hpp" /// USM: Implements USM Host allocations using HIP Pinned Memory -UR_APIEXPORT ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t pool, size_t size, void **ppMem) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -49,10 +49,10 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, } /// USM: Implements USM device allocations using a normal HIP device pointer -UR_APIEXPORT ur_result_t UR_APICALL -urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t size, void **ppMem) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -85,10 +85,10 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, } /// USM: Implements USM Shared allocations using HIP Managed Memory -UR_APIEXPORT ur_result_t UR_APICALL -urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t size, void **ppMem) { 
+UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); From c0bfd2ef8adb8dac7cc522e03df9739b099e5c1e Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Thu, 15 Jun 2023 11:30:56 +0100 Subject: [PATCH 25/42] [SYCL][UR][HIP] Fix a few issues in the hip adapter revealed by UR enqueue CTS. --- .../ur/adapters/hip/device.cpp | 3 +- .../ur/adapters/hip/enqueue.cpp | 116 +++++++++++------- .../ur/adapters/hip/memory.hpp | 15 ++- 3 files changed, 85 insertions(+), 49 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 154b95022fc2c..33dbb1ede8136 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -840,7 +840,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } - + case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: + return ReturnValue(false); // TODO: Investigate if this information is available on HIP. 
case UR_DEVICE_INFO_GPU_EU_COUNT: case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index 6cf7e169d364b..749cdaad45dff 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -622,6 +622,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(size + dstOffset <= hBufferDst->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; @@ -706,15 +710,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( return Result; } +// HIP has no memset functions that allow setting values more than 4 bytes. UR +// API lets you pass an arbitrary "pattern" to the buffer fill, which can be +// more than 4 bytes. We must break up the pattern into 1 byte values, and set +// the buffer using multiple strided calls. The first 4 patterns are set using +// hipMemsetD32Async then all subsequent 1 byte patterns are set using +// hipMemset2DAsync which is called for each pattern. +ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, + size_t Size, const void *pPattern, + hipDeviceptr_t Ptr) { + // Calculate the number of patterns, stride, number of times the pattern + // needs to be applied, and the number of times the first 32 bit pattern + // needs to be applied. 
+ auto NumberOfSteps = PatternSize / sizeof(uint8_t); + auto Pitch = NumberOfSteps * sizeof(uint8_t); + auto Height = Size / NumberOfSteps; + auto Count32 = Size / sizeof(uint32_t); + + // Get 4-byte chunk of the pattern and call hipMemsetD32Async + auto Value = *(static_cast(pPattern)); + auto Result = UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream)); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + for (auto step = 4u; step < NumberOfSteps; ++step) { + // take 1 byte of the pattern + Value = *(static_cast(pPattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = reinterpret_cast(reinterpret_cast(Ptr) + + (step * sizeof(uint8_t))); + + // set all of the pattern chunks + Result = UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, + sizeof(uint8_t), Height, Stream)); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size + offset <= hBuffer->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); @@ -773,38 +819,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } default: { - // HIP has no memset functions that allow setting values more than 4 - // bytes. UR API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 1 byte values, and set the buffer using multiple strided calls. 
- // The first 4 patterns are set using hipMemsetD32Async then all - // subsequent 1 byte patterns are set using hipMemset2DAsync which is - // called for each pattern. - - // Calculate the number of patterns, stride, number of times the pattern - // needs to be applied, and the number of times the first 32 bit pattern - // needs to be applied. - auto NumberOfSteps = patternSize / sizeof(uint8_t); - auto Pitch = NumberOfSteps * sizeof(uint8_t); - auto Height = size / NumberOfSteps; - auto Count32 = size / sizeof(uint32_t); - - // Get 4-byte chunk of the pattern and call hipMemsetD32Async - auto Value = *(static_cast(pPattern)); - Result = - UR_CHECK_ERROR(hipMemsetD32Async(DstDevice, Value, Count32, Stream)); - for (auto step = 4u; step < NumberOfSteps; ++step) { - // take 1 byte of the pattern - Value = *(static_cast(pPattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = reinterpret_cast( - reinterpret_cast(DstDevice) + (step * sizeof(uint8_t))); - - // set all of the pattern chunks - Result = UR_CHECK_ERROR(hipMemset2DAsync( - OffsetPtr, Pitch, Value, sizeof(uint8_t), Height, Stream)); - } + Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, + DstDevice); break; } } @@ -1123,6 +1139,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_ASSERT(ppRetMap, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; const bool IsPinned = @@ -1131,11 +1149,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( // Currently no support for overlapping regions if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { - return Result; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } // Allocate a pointer in the host to store the mapped information - auto 
HostPtr = hBuffer->Mem.BufferMem.mapToPtr(offset, mapFlags); + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(size, offset, mapFlags); *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); if (HostPtr) { Result = UR_RESULT_SUCCESS; @@ -1199,8 +1217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { // Pinned host memory is only on host so it doesn't need to be written to. Result = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(pMappedPtr), - hMem->Mem.BufferMem.getSize(), pMappedPtr, numEventsInWaitList, + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(), + hMem->Mem.BufferMem.getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { ScopedContext Active(hQueue->getContext()); @@ -1252,25 +1270,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( UR_COMMAND_USM_FILL, hQueue, HIPStream, StreamToken)); EventPtr->start(); } + + auto N = size / patternSize; switch (patternSize) { case 1: Result = UR_CHECK_ERROR( hipMemsetD8Async(reinterpret_cast(ptr), - *(const uint8_t *)pPattern & 0xFF, size, HIPStream)); + *(const uint8_t *)pPattern & 0xFF, N, HIPStream)); break; case 2: Result = UR_CHECK_ERROR(hipMemsetD16Async( reinterpret_cast(ptr), - *(const uint16_t *)pPattern & 0xFFFF, size, HIPStream)); + *(const uint16_t *)pPattern & 0xFFFF, N, HIPStream)); break; case 4: Result = UR_CHECK_ERROR(hipMemsetD32Async( reinterpret_cast(ptr), - *(const uint32_t *)pPattern & 0xFFFFFFFF, size, HIPStream)); + *(const uint32_t *)pPattern & 0xFFFFFFFF, N, HIPStream)); break; default: - return UR_RESULT_ERROR_INVALID_ARGUMENT; + Result = commonMemSetLargePattern(HIPStream, patternSize, size, pPattern, + reinterpret_cast(ptr)); + break; } if (phEvent) { @@ -1328,9 +1350,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t 
*phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + unsigned int PointerRangeSize = 0; + UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, + HIP_POINTER_ATTRIBUTE_RANGE_SIZE, + (hipDeviceptr_t)pMem)); + UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); // flags is currently unused so fail if set if (flags != 0) @@ -1365,11 +1391,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { - std::ignore = size; std::ignore = advice; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + unsigned int PointerRangeSize = 0; + UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, + HIP_POINTER_ATTRIBUTE_RANGE_SIZE, + (hipDeviceptr_t)pMem)); + UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); // TODO implement a mapping to hipMemAdvise once the expected behaviour // of urEnqueueUSMAdvise is detailed in the USM extension diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp index 95439609070e2..ad1d62641f39a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp @@ -50,6 +50,8 @@ struct ur_mem_handle_t_ { void *HostPtr; /// Size of the allocation in bytes size_t Size; + /// Size of the active mapped region. + size_t MapSize; /// Offset of the active mapped region. 
size_t MapOffset; /// Pointer to the active mapped region, if any @@ -84,17 +86,18 @@ struct ur_mem_handle_t_ { void *getMapPtr() const noexcept { return MapPtr; } - size_t getMapOffset(void *Ptr) const noexcept { - std::ignore = Ptr; - return MapOffset; - } + size_t getMapSize() const noexcept { return MapSize; } + + size_t getMapOffset() const noexcept { return MapOffset; } /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the HIP allocation. /// - void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { + void *mapToPtr(size_t Size, size_t Offset, + ur_map_flags_t Flags) noexcept { assert(MapPtr == nullptr); + MapSize = Size; MapOffset = Offset; MapFlags = Flags; if (HostPtr) { @@ -115,6 +118,7 @@ struct ur_mem_handle_t_ { free(MapPtr); } MapPtr = nullptr; + MapSize = 0; MapOffset = 0; } @@ -147,6 +151,7 @@ struct ur_mem_handle_t_ { Mem.BufferMem.Parent = Parent; Mem.BufferMem.HostPtr = HostPtr; Mem.BufferMem.Size = Size; + Mem.BufferMem.MapSize = 0; Mem.BufferMem.MapOffset = 0; Mem.BufferMem.MapPtr = nullptr; Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; From 4ef73efc2aa5066c7b8f6df193d2d43442b28cbd Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 19 Jun 2023 14:30:08 +0100 Subject: [PATCH 26/42] [SYCL][UR][HIP] Mirror SYCL_PI env vars with UR prefix --- .../ur/adapters/hip/common.cpp | 6 +++-- .../ur/adapters/hip/enqueue.cpp | 25 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp index 071905d3614e3..ebc3bdd02033e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -34,7 +34,8 @@ ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, return UR_RESULT_SUCCESS; } - if 
(std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr || + std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { const char *ErrorString = nullptr; const char *ErrorName = nullptr; ErrorName = hipGetErrorName(Result); @@ -50,7 +51,8 @@ ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, std::cerr << SS.str(); } - if (std::getenv("PI_HIP_ABORT") != nullptr) { + if (std::getenv("PI_HIP_ABORT") != nullptr || + std::getenv("UR_HIP_ABORT") != nullptr) { std::abort(); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index 749cdaad45dff..c29e515942129 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -307,24 +307,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = + static const char *LocalMemSzPtrUR = + std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE"); + static const char *LocalMemSzPtrPI = std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE"); + static const char *LocalMemSzPtr = + LocalMemSzPtrUR ? LocalMemSzPtrUR + : (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr); - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; + if (LocalMemSzPtr) { + int DeviceMaxLocalMem = 0; Result = UR_CHECK_ERROR(hipDeviceGetAttribute( - &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock, + &DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock, hQueue->getDevice()->get())); - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE", + static const int EnvVal = std::atoi(LocalMemSzPtr); + if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) { + setErrorMessage(LocalMemSzPtrUR ? 
"Invalid value specified for " + "UR_HIP_MAX_LOCAL_MEM_SIZE" + : "Invalid value specified for " + "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE", UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } Result = UR_CHECK_ERROR(hipFuncSetAttribute( - HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val)); + HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, EnvVal)); } Result = UR_CHECK_ERROR(hipModuleLaunchKernel( From 922894e69405ca0a843f77439cf504d7d719cf75 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 16 Jun 2023 11:33:59 +0100 Subject: [PATCH 27/42] [sycl][hip][ur] remove validation checks from inside the adapter --- .../ur/adapters/hip/context.cpp | 16 +---- .../ur/adapters/hip/device.cpp | 20 ++---- .../ur/adapters/hip/enqueue.cpp | 68 ------------------- .../unified_runtime/ur/adapters/hip/event.cpp | 10 --- .../ur/adapters/hip/kernel.cpp | 22 ------ .../ur/adapters/hip/memory.cpp | 12 ---- .../ur/adapters/hip/platform.cpp | 8 +-- .../ur/adapters/hip/program.cpp | 18 ----- .../unified_runtime/ur/adapters/hip/queue.cpp | 18 +---- .../ur/adapters/hip/sampler.cpp | 4 -- .../unified_runtime/ur/adapters/hip/usm.cpp | 15 ---- 11 files changed, 9 insertions(+), 202 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp index 6710ec4e1e206..f08cd8cb2c43c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -20,9 +20,6 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, std::ignore = DeviceCount; std::ignore = pProperties; - UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); - assert(DeviceCount == 1); ur_result_t RetErr = UR_RESULT_SUCCESS; @@ -41,7 +38,7 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, static std::once_flag InitFlag; std::call_once( 
InitFlag, - [](ur_result_t &Err) { + [](ur_result_t &) { // Use default stream to record base event counter UR_CHECK_ERROR(hipEventCreateWithFlags(&ur_platform_handle_t_::EvBase, hipEventDefault)); @@ -69,7 +66,6 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); @@ -106,8 +102,6 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t hContext) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hContext->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } @@ -146,8 +140,6 @@ urContextRelease(ur_context_handle_t hContext) { UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t hContext) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(hContext->getReferenceCount() > 0); hContext->incrementReferenceCount(); @@ -156,9 +148,6 @@ urContextRetain(ur_context_handle_t hContext) { UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phNativeContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); - *phNativeContext = reinterpret_cast(hContext->get()); return UR_RESULT_SUCCESS; } @@ -180,9 +169,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter, void *pUserData) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); - 
hContext->setExtendedDeleter(pfnDeleter, pUserData); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 33dbb1ede8136..b239572268e6c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -24,7 +24,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); static constexpr uint32_t MaxWorkItemDimensions = 3u; @@ -862,8 +861,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, /// \return UR_RESULT_SUCCESS if the function is executed successfully /// HIP devices are always root devices so retain always returns success. UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - + std::ignore = hDevice; return UR_RESULT_SUCCESS; } @@ -877,8 +875,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// devices. UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t hDevice) { - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - + std::ignore = hDevice; return UR_RESULT_SUCCESS; } @@ -893,8 +890,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; const bool ReturnDevices = AskingForDefault || AskingForGPU || AskingForAll; - UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t NumDevices = ReturnDevices ? 
hPlatform->Devices.size() : 0; try { @@ -926,9 +921,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phNativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); - *phNativeHandle = reinterpret_cast(hDevice->get()); return UR_RESULT_SUCCESS; } @@ -937,11 +929,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { + std::ignore = hNativeDevice; std::ignore = hPlatform; std::ignore = pProperties; - - UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); + std::ignore = phDevice; return UR_RESULT_ERROR_INVALID_OPERATION; } @@ -954,7 +945,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Ignore unused parameter std::ignore = hDevice; - UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); // Look for an image for the HIP target, and return the first one that is @@ -980,8 +970,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, uint64_t *pDeviceTimestamp, uint64_t *pHostTimestamp) { - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (!pDeviceTimestamp && !pHostTimestamp) return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index c29e515942129..8c76803accd8a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -93,10 
+93,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), @@ -143,10 +139,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), @@ -194,11 +186,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pGlobalWorkSize, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hQueue->getContext() == hKernel->getContext(), UR_RESULT_ERROR_INVALID_QUEUE); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); @@ -373,7 +360,6 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urEnqueueEventsWait( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST) UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), @@ -506,10 +492,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!(phEventWaitList == NULL && numEventsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0), @@ -579,10 +561,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; void *DevPtr = hBuffer->Mem.BufferMem.getVoid(); std::unique_ptr RetImplEvent{nullptr}; @@ -628,7 +606,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), UR_RESULT_ERROR_INVALID_SIZE); UR_ASSERT(size + dstOffset <= 
hBufferDst->Mem.BufferMem.getSize(), @@ -678,11 +655,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; void *SrcPtr = hBufferSrc->Mem.BufferMem.getVoid(); void *DstPtr = hBufferDst->Mem.BufferMem.getVoid(); @@ -763,12 +735,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size + offset <= hBuffer->Mem.BufferMem.getSize(), UR_RESULT_ERROR_INVALID_SIZE); - auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); @@ -854,8 +822,6 @@ static ur_result_t commonEnqueueMemImageNDCopy( hipStream_t HipStream, ur_mem_type_t ImgType, const size_t *Region, const void *SrcPtr, const hipMemoryType SrcType, const size_t *SrcOffset, void *DstPtr, const hipMemoryType DstType, const size_t *DstOffset) { - UR_ASSERT(Region, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(SrcType == hipMemoryTypeArray || SrcType == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(DstType == hipMemoryTypeArray || DstType == hipMemoryTypeHost, @@ -927,9 +893,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - 
UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -995,9 +958,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( std::ignore = blockingWrite; std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -1059,10 +1019,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE) - UR_ASSERT(hImageSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE) - UR_ASSERT(hImageDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE) UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, @@ -1141,9 +1097,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(ppRetMap, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), @@ -1204,10 +1157,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { 
ur_result_t Result = UR_RESULT_SUCCESS; - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, @@ -1255,11 +1204,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr EventPtr{nullptr}; @@ -1317,10 +1261,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr EventPtr{nullptr}; @@ -1357,8 +1297,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); unsigned int PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, @@ -1400,8 +1338,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void 
*pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { std::ignore = advice; - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); unsigned int PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, @@ -1452,10 +1388,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const void *pSrc, size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t Result = UR_RESULT_SUCCESS; try { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index 56e379071e40c..ee9518454ce3f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -159,7 +159,6 @@ ur_result_t ur_event_handle_t_::release() { UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_NULL_POINTER); try { @@ -186,7 +185,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { - UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_SIZE); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); @@ -214,7 +212,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ur_event_handle_t hEvent, ur_profiling_info_t propName, size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { - 
UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); @@ -250,8 +247,6 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, } UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { - UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - const auto RefCount = hEvent->incrementReferenceCount(); sycl::detail::ur::assertion( @@ -261,8 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { } UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { - UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( @@ -294,9 +287,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { /// a user event. UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { - UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phNativeEvent, UR_RESULT_ERROR_INVALID_NULL_POINTER); - *phNativeEvent = reinterpret_cast(hEvent->get()); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 0852767c95d05..fb494dd2600da 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -13,10 +13,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ur_kernel_handle_t *phKernel) { - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t Result = UR_RESULT_SUCCESS; 
std::unique_ptr RetKernel{nullptr}; @@ -55,8 +51,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, ur_kernel_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -140,7 +134,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, } UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); hKernel->incrementReferenceCount(); @@ -149,8 +142,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease(ur_kernel_handle_t hKernel) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // double delete or someone is messing with the ref count. // either way, cannot safely proceed. 
UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); @@ -178,8 +169,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const void *pArgValue) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { @@ -198,8 +187,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, size_t propSize, void *pKernelInfo, size_t *pPropSizeRet) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); switch (propName) { @@ -226,8 +213,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, ur_kernel_sub_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { @@ -279,9 +264,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { - - UR_ASSERT(hKernel != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. 
if (hArgValue == nullptr) { @@ -320,10 +302,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_sampler_handle_t hArgValue) { - - UR_ASSERT(hKernel != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hArgValue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; try { uint32_t SamplerProps = hArgValue->Props; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp index 9f13d3b6c7c9f..db7f716393ec2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -6,8 +6,6 @@ /// If this is zero, calls the relevant HIP Free function /// \return UR_RESULT_SUCCESS unless deallocation error UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { - UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -74,7 +72,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Validate flags UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); @@ -84,7 +81,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( UR_RESULT_ERROR_INVALID_HOST_PTR); } // Need input memory object - UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Currently, USE_HOST_PTR is not implemented using host register @@ -163,7 +159,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( ur_mem_handle_t hBuffer, ur_mem_flags_t flags, ur_buffer_create_type_t 
bufferCreateType, const ur_buffer_region_t *pRegion, ur_mem_handle_t *phMem) { - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -189,8 +184,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); @@ -241,7 +234,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, void *pMemInfo, size_t *pPropSizeRet) { - UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, UR_RESULT_ERROR_INVALID_ENUMERATION); UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -335,9 +327,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( void *pHost, ur_mem_handle_t *phMem) { // Need input memory object - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); if (flags & @@ -520,7 +509,6 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, } UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { - UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp index 2f49a6d7c3a04..130703743b1e1 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -13,8 +13,6 @@ hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, size_t propSize, void *pPropValue, size_t *pSizeRet) { - - UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propSize, pPropValue, pSizeRet); switch (propName) { @@ -124,9 +122,7 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { - UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); - + std::ignore = hDriver; *pVersion = UR_API_VERSION_CURRENT; return UR_RESULT_SUCCESS; } @@ -148,8 +144,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( const char **ppPlatformOption) { std::ignore = hPlatform; using namespace std::literals; - if (pFrontendOption == nullptr) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || pFrontendOption == ""sv) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 144b9bc2cef77..ac6d92b1b45f2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -78,8 +78,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_device_handle_t hDevice = hContext->getDevice(); auto pBinary = reinterpret_cast(pIL); 
@@ -105,8 +103,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, const char *pOptions) { std::ignore = hContext; - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -155,8 +151,6 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, // Ignore unused parameter std::ignore = hDevice; - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -176,8 +170,6 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); switch (propName) { @@ -205,7 +197,6 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain(ur_program_handle_t hProgram) { - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hProgram->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); hProgram->incrementReferenceCount(); return UR_RESULT_SUCCESS; @@ -216,8 +207,6 @@ urProgramRetain(ur_program_handle_t hProgram) { /// the context. UR_APIEXPORT ur_result_t UR_APICALL urProgramRelease(ur_program_handle_t hProgram) { - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // double delete or someone is messing with the ref count. // either way, cannot safely proceed. 
UR_ASSERT(hProgram->getReferenceCount() != 0, @@ -253,7 +242,6 @@ urProgramRelease(ur_program_handle_t hProgram) { /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( ur_program_handle_t hProgram, ur_native_handle_t *phNativeProgram) { - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); *phNativeProgram = reinterpret_cast(hProgram->get()); return UR_RESULT_SUCCESS; } @@ -267,9 +255,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, const uint8_t *pBinary, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); @@ -308,11 +293,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( ur_device_handle_t hDevice, ur_program_handle_t hProgram, const char *pFunctionName, void **ppFunctionPointer) { // Check if device passed is the same the device bound to the context - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); hipFunction_t Func; hipError_t Ret = hipModuleGetFunction(&Func, hProgram->get(), pFunctionName); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index 5040423b6d526..e8438c8df5c38 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -108,10 +108,6 @@ hipStream_t 
ur_queue_handle_t_::getNextTransferStream() { UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - try { std::unique_ptr QueueImpl{nullptr}; @@ -123,7 +119,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, unsigned int Flags = 0; const bool IsOutOfOrder = - pProps->flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + pProps ? pProps->flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE + : false; std::vector ComputeHipStreams( IsOutOfOrder ? ur_queue_handle_t_::DefaultNumComputeStreams : 1); @@ -132,7 +129,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, QueueImpl = std::unique_ptr(new ur_queue_handle_t_{ std::move(ComputeHipStreams), std::move(TransferHipStreams), hContext, - hDevice, Flags, pProps->flags}); + hDevice, Flags, pProps ? 
pProps->flags : 0}); *phQueue = QueueImpl.release(); @@ -149,8 +146,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); switch (propName) { case UR_QUEUE_INFO_CONTEXT: @@ -184,7 +179,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, } UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_QUEUE); hQueue->incrementReferenceCount(); @@ -192,8 +186,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hQueue->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } @@ -217,8 +209,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { } UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // set default result to a negative result (avoid false-positve tests) ur_result_t Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -257,8 +247,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); ScopedContext Active(hQueue->getContext()); *phNativeQueue = diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp index 9cc1520d13ade..2b60e225781a7 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp @@ -32,7 +32,6 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); switch (propName) { @@ -61,14 +60,11 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, } ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { - UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); hSampler->incrementReferenceCount(); return UR_RESULT_SUCCESS; } ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { - UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index 8849dd59d23b1..3ffc53cccdb51 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -17,8 +17,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), @@ -53,10 +51,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - 
UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), @@ -89,10 +83,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), @@ -123,8 +113,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( /// USM: Frees the given USM pointer associated with the context. UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); ur_result_t Result = UR_RESULT_SUCCESS; try { ScopedContext Active(hContext); @@ -151,9 +139,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur_usm_alloc_info_t propName, size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { - UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t Result = UR_RESULT_SUCCESS; hipPointerAttribute_t hipPointerAttributeType; From 61f93dbfd5ef57bfa7007537ba1cc4715d170e44 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 20 Jun 2023 12:21:13 +0100 Subject: [PATCH 28/42] [SYCL][UR][HIP] Fix some build warnings --- sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp | 8 ++++++-- sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp | 1 + 
sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp | 8 ++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index 8c76803accd8a..9cfa8ac721fd8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -71,6 +71,8 @@ void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock, assert(ThreadsPerBlock != nullptr); assert(GlobalWorkSize != nullptr); assert(Kernel != nullptr); + + std::ignore = Kernel; // int recommendedBlockSize, minGrid; // UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( @@ -1297,10 +1299,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + void *HIPDevicePtr = const_cast(pMem); unsigned int PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, - (hipDeviceptr_t)pMem)); + (hipDeviceptr_t)HIPDevicePtr)); UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); // flags is currently unused so fail if set @@ -1338,10 +1341,11 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { std::ignore = advice; + void *HIPDevicePtr = const_cast(pMem); unsigned int PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, - (hipDeviceptr_t)pMem)); + (hipDeviceptr_t)HIPDevicePtr)); UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); // TODO implement a mapping to hipMemAdvise once the expected behaviour diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp 
index 130703743b1e1..a1883d80975be 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -13,6 +13,7 @@ hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, size_t propSize, void *pPropValue, size_t *pSizeRet) { + std::ignore = hPlatform; UrReturnHelper ReturnValue(propSize, pPropValue, pSizeRet); switch (propName) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index ac6d92b1b45f2..4fbbc07d69577 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -120,6 +120,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLink(ur_context_handle_t hContext, uint32_t count, const ur_program_handle_t *phPrograms, const char *pOptions, ur_program_handle_t *phProgram) { + std::ignore = hContext; + std::ignore = count; + std::ignore = phPrograms; + std::ignore = pOptions; + std::ignore = phProgram; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -255,6 +261,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, const uint8_t *pBinary, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { + std::ignore = pProperties; + UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); From c0e2ada5798257bade504060425704767b6c465a Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 21 Jun 2023 09:56:15 +0100 Subject: [PATCH 29/42] [SYCL][HIP] Fix device info return type for USM queries --- .../plugins/unified_runtime/ur/adapters/hip/device.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index b239572268e6c..b4c5fb06d4194 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -606,7 +606,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint64_t Value = {}; + ur_device_usm_access_capability_flags_t Value = {}; // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host if (getAttribute(hDevice, hipDeviceAttributeComputeCapabilityMajor) >= 6) { @@ -631,7 +631,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint64_t Value = + ur_device_usm_access_capability_flags_t Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | @@ -644,7 +644,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // allocation associated with this device." 
// // query if/how the device can access managed memory associated to it - uint64_t Value = {}; + ur_device_usm_access_capability_flags_t Value = {}; if (getAttribute(hDevice, hipDeviceAttributeManagedMemory)) { // the device can allocate managed memory on this system Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | @@ -672,7 +672,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // // query if/how the device can access managed memory associated to other // devices - uint64_t Value = {}; + ur_device_usm_access_capability_flags_t Value = {}; if (getAttribute(hDevice, hipDeviceAttributeManagedMemory)) { // the device can allocate managed memory on this system Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; @@ -700,7 +700,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint64_t Value = {}; + ur_device_usm_access_capability_flags_t Value = {}; if (getAttribute(hDevice, hipDeviceAttributePageableMemoryAccess)) { // the link between the device and the host does not support native // atomic operations From 29a8cc0e329fd8cbc2bfa4be89aa9ab00aefd82d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 23 Jun 2023 10:30:58 +0100 Subject: [PATCH 30/42] [SYCL][UR][HIP] Migrate blocking until the event is ready functionality --- .../unified_runtime/ur/adapters/hip/event.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index ee9518454ce3f..616ea2d7e9a36 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -82,6 +82,11 @@ uint64_t ur_event_handle_t_::getQueuedTime() const { float MiliSeconds = 0.0f; assert(isStarted()); + // hipEventSynchronize waits till the event is ready for call to + // 
hipEventElapsedTime. + UR_CHECK_ERROR(hipEventSynchronize(evStart_)); + UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, EvStart, EventEnd)); return static_cast(MiliSeconds * 1.0e6); } @@ -90,6 +95,11 @@ uint64_t ur_event_handle_t_::getStartTime() const { float MiliSeconds = 0.0f; assert(isStarted()); + // hipEventSynchronize waits till the event is ready for call to + // hipEventElapsedTime. + UR_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); + UR_CHECK_ERROR(hipEventSynchronize(evStart_)); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EvStart)); return static_cast(MiliSeconds * 1.0e6); @@ -99,6 +109,11 @@ uint64_t ur_event_handle_t_::getEndTime() const { float MiliSeconds = 0.0f; assert(isStarted() && isRecorded()); + // hipEventSynchronize waits till the event is ready for call to + // hipEventElapsedTime. + UR_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); + UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EventEnd)); return static_cast(MiliSeconds * 1.0e6); From ea9aefefb85d015d9040e20fab4a1a5a686ed76a Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 23 Jun 2023 11:18:17 +0100 Subject: [PATCH 31/42] [SYCL][UR][HIP] Update unified-runtime --- .../ur/adapters/hip/device.cpp | 4 +-- .../unified_runtime/ur/adapters/hip/event.cpp | 30 +++++++++---------- .../unified_runtime/ur/adapters/hip/event.hpp | 6 ++-- .../ur/adapters/hip/kernel.cpp | 7 +++-- .../ur/adapters/hip/ur_interface_loader.cpp | 2 +- 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index b4c5fb06d4194..6e0cc647d1f10 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -589,7 +589,7 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { return ReturnValue(0u); } - case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { return ReturnValue(static_cast(0u)); } case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { @@ -866,7 +866,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { } UR_APIEXPORT ur_result_t UR_APICALL -urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, +urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, uint32_t, ur_device_handle_t *, uint32_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index 616ea2d7e9a36..ebdc612ff30b0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -16,14 +16,14 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, ur_queue_handle_t Queue, hipStream_t Stream, uint32_t StreamToken) : CommandType{Type}, RefCount{1}, HasBeenWaitedOn{false}, IsRecorded{false}, - IsStarted{false}, StreamToken{StreamToken}, EventEnd{nullptr}, + IsStarted{false}, StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; UR_CHECK_ERROR(hipEventCreateWithFlags( - &EventEnd, ProfilingEnabled ? hipEventDefault : hipEventDisableTiming)); + &EvEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); if (ProfilingEnabled) { UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault)); @@ -66,7 +66,7 @@ bool ur_event_handle_t_::isCompleted() const noexcept { return false; } if (!HasBeenWaitedOn) { - const hipError_t Result = hipEventQuery(EventEnd); + const hipError_t Result = hipEventQuery(EvEnd); if (Result != hipSuccess && Result != hipErrorNotReady) { UR_CHECK_ERROR(Result); return false; @@ -84,10 +84,10 @@ uint64_t ur_event_handle_t_::getQueuedTime() const { // hipEventSynchronize waits till the event is ready for call to // hipEventElapsedTime. - UR_CHECK_ERROR(hipEventSynchronize(evStart_)); - UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); + UR_CHECK_ERROR(hipEventSynchronize(EvStart)); + UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); - UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, EvStart, EventEnd)); + UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, EvStart, EvEnd)); return static_cast(MiliSeconds * 1.0e6); } @@ -97,8 +97,8 @@ uint64_t ur_event_handle_t_::getStartTime() const { // hipEventSynchronize waits till the event is ready for call to // hipEventElapsedTime. - UR_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); - UR_CHECK_ERROR(hipEventSynchronize(evStart_)); + UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase)); + UR_CHECK_ERROR(hipEventSynchronize(EvStart)); UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EvStart)); @@ -111,11 +111,11 @@ uint64_t ur_event_handle_t_::getEndTime() const { // hipEventSynchronize waits till the event is ready for call to // hipEventElapsedTime. 
- UR_CHECK_ERROR(hipEventSynchronize(_pi_platform::evBase_)); - UR_CHECK_ERROR(hipEventSynchronize(evEnd_)); + UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase)); + UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); - UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, - ur_platform_handle_t_::EvBase, EventEnd)); + UR_CHECK_ERROR( + hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EvEnd)); return static_cast(MiliSeconds * 1.0e6); } @@ -135,7 +135,7 @@ ur_result_t ur_event_handle_t_::record() { sycl::detail::ur::die( "Unrecoverable program state reached in event identifier overflow"); } - Result = UR_CHECK_ERROR(hipEventRecord(EventEnd, Stream)); + Result = UR_CHECK_ERROR(hipEventRecord(EvEnd, Stream)); } catch (ur_result_t Error) { Result = Error; } @@ -150,7 +150,7 @@ ur_result_t ur_event_handle_t_::record() { ur_result_t ur_event_handle_t_::wait() { ur_result_t Result; try { - Result = UR_CHECK_ERROR(hipEventSynchronize(EventEnd)); + Result = UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); HasBeenWaitedOn = true; } catch (ur_result_t Error) { Result = Error; @@ -161,7 +161,7 @@ ur_result_t ur_event_handle_t_::wait() { ur_result_t ur_event_handle_t_::release() { assert(Queue != nullptr); - UR_CHECK_ERROR(hipEventDestroy(EventEnd)); + UR_CHECK_ERROR(hipEventDestroy(EvEnd)); if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { UR_CHECK_ERROR(hipEventDestroy(EvQueued)); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp index d77b080909de9..6311c942b4a61 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp @@ -22,7 +22,7 @@ struct ur_event_handle_t_ { ur_result_t start(); - native_type get() const noexcept { return EventEnd; }; + native_type get() const noexcept { return EvEnd; }; ur_queue_handle_t getQueue() const noexcept { return Queue; } @@ -108,8 +108,8 @@ struct ur_event_handle_t_ { uint32_t 
StreamToken; uint32_t EventId; // Queue identifier of the event. - native_type EventEnd; // HIP event handle. If this ur_event_handle_t_ - // represents a user event, this will be nullptr. + native_type EvEnd; // HIP event handle. If this ur_event_handle_t_ + // represents a user event, this will be nullptr. native_type EvStart; // HIP event handle associated with the start diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index fb494dd2600da..924da7b9914d8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -262,8 +262,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( - ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *pProperties, + ur_mem_handle_t hArgValue) { + std::ignore = pProperties; // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. 
if (hArgValue == nullptr) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index c7258ad241373..6d4a8eadda747 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -200,7 +200,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnGetLastResult = urGetLastResult; + pDdiTable->pfnInit = urInit; pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; From 20d5432421a6bac0cd71746aa8bdf296cd4cfcd8 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 29 Jun 2023 14:38:23 +0100 Subject: [PATCH 32/42] [SYCL][HIP][UR] Address feedback --- .../ur/adapters/hip/device.cpp | 1 - .../ur/adapters/hip/program.cpp | 2 +- .../unified_runtime/ur/adapters/hip/usm.cpp | 63 +++++++------------ 3 files changed, 24 insertions(+), 42 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 6e0cc647d1f10..7d2b94ea9dea7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -720,7 +720,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_ATOMIC_64: { - // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS. 
hipDeviceProp_t Props; sycl::detail::ur::assertion( hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 4fbbc07d69577..80588f10aaa98 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -317,4 +317,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( } return Result; -} \ No newline at end of file +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index 3ffc53cccdb51..b27ffc969c484 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -18,14 +18,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - size_t DeviceMaxMemAllocSize = 0; - UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), - UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&DeviceMaxMemAllocSize), - nullptr) == UR_RESULT_SUCCESS, - UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, - UR_RESULT_ERROR_INVALID_USM_SIZE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -35,13 +30,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( Result = Error; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); - - assert(Result == UR_RESULT_SUCCESS && - (!pUSMDesc || pUSMDesc->align == 0 || - reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + if (Result == UR_RESULT_SUCCESS) { + assert((!pUSMDesc || pUSMDesc->align == 0 
|| + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + } return Result; } @@ -51,14 +43,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - size_t DeviceMaxMemAllocSize = 0; - UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - static_cast(&DeviceMaxMemAllocSize), - nullptr) == UR_RESULT_SUCCESS, - UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, - UR_RESULT_ERROR_INVALID_USM_SIZE); + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -67,13 +55,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( } catch (ur_result_t Error) { Result = Error; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); - assert(Result == UR_RESULT_SUCCESS && - (!pUSMDesc || pUSMDesc->align == 0 || - reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + if (Result == UR_RESULT_SUCCESS) { + assert((!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + } return Result; } @@ -83,14 +69,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - size_t DeviceMaxMemAllocSize = 0; - UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - static_cast(&DeviceMaxMemAllocSize), - nullptr) == UR_RESULT_SUCCESS, - UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, - UR_RESULT_ERROR_INVALID_USM_SIZE); + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + 
((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -103,9 +85,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(Result == UR_RESULT_SUCCESS && - (!pUSMDesc || pUSMDesc->align == 0 || - reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + if (Result == UR_RESULT_SUCCESS) { + assert((!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + } return Result; } From 5438314ccd80335dbd62d98551baedc4284b9cac Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 29 Jun 2023 14:59:32 +0100 Subject: [PATCH 33/42] [SYCL][HIP][UR] Change return code for unsupported context info queries --- sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp index f08cd8cb2c43c..c3e34df5ee517 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -82,9 +82,7 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, case UR_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { // These queries should be dealt with in context_impl.cpp by calling the // queries of each device separately and building the intersection set. - setErrorMessage("These queries should have never come here.", - UR_RESULT_ERROR_INVALID_ARGUMENT); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + return UR_RESULT_ERROR_INVALID_ENUMERATION; } case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. 
From 6d0b398cd176a95063a83413e681e38e30f67b7b Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 29 Jun 2023 15:31:44 +0100 Subject: [PATCH 34/42] [SYCL][HIP][UR] Add pProp to kernelSet* entry-points --- .../ur/adapters/hip/device.cpp | 2 +- .../ur/adapters/hip/kernel.cpp | 25 +++++++++++++------ .../unified_runtime/ur/adapters/hip/usm.cpp | 2 ++ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 7d2b94ea9dea7..6e07390a4361a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -807,7 +807,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, #endif return UR_RESULT_ERROR_INVALID_VALUE; } - case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + case UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident // on a multiprocessor. 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 924da7b9914d8..8c28dd86fd530 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -166,9 +166,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, - size_t argSize, const void *pArgValue) { +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( + ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, + const ur_kernel_arg_value_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { @@ -256,8 +258,11 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( - ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgPointer(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_pointer_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } @@ -304,7 +309,9 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_sampler_properties_t *pProperties, ur_sampler_handle_t hArgValue) { + std::ignore = pProperties; ur_result_t Result = UR_RESULT_SUCCESS; try { uint32_t SamplerProps = hArgValue->Props; @@ -316,12 +323,14 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, } // A NOP for the HIP backend -UR_APIEXPORT 
ur_result_t UR_APICALL -urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, - size_t propSize, const void *pPropValue) { +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( + ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, + const ur_kernel_exec_info_properties_t *pProperties, + const void *pPropValue) { std::ignore = hKernel; std::ignore = propName; std::ignore = propSize; + std::ignore = pProperties; std::ignore = pPropValue; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index b27ffc969c484..daca6c6061c09 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -43,6 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + std::ignore = hDevice; UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), @@ -69,6 +70,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + std::ignore = hDevice; UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), From ea12dbd4d9aa6b82b8a3668e23a6ebf8d505b269 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 29 Jun 2023 16:02:50 +0100 Subject: [PATCH 35/42] [SYCL][HIP][UR] Remove sycl dependencies from hip ur adapter --- .../ur/adapters/hip/common.cpp | 6 +- .../ur/adapters/hip/common.hpp | 8 +- .../ur/adapters/hip/device.cpp | 232 +++++++++--------- .../ur/adapters/hip/enqueue.cpp | 2 +- .../unified_runtime/ur/adapters/hip/event.cpp | 11 +- .../ur/adapters/hip/kernel.cpp | 58 
++--- .../ur/adapters/hip/memory.cpp | 3 +- .../ur/adapters/hip/sampler.cpp | 2 +- 8 files changed, 156 insertions(+), 166 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp index ebc3bdd02033e..41c18b798db07 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -70,17 +70,17 @@ std::string getHipVersionString() { return Stream.str(); } -void sycl::detail::ur::die(const char *pMessage) { +void detail::ur::die(const char *pMessage) { std::cerr << "ur_die: " << pMessage << std::endl; std::terminate(); } -void sycl::detail::ur::assertion(bool Condition, const char *pMessage) { +void detail::ur::assertion(bool Condition, const char *pMessage) { if (!Condition) die(pMessage); } -void sycl::detail::ur::hipPrint(const char *pMessage) { +void detail::ur::hipPrint(const char *pMessage) { std::cerr << "ur_print: " << pMessage << std::endl; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp index 7d010c4a6ac93..c3d3a6e23dd18 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp @@ -8,7 +8,6 @@ #pragma once #include -#include #include // Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be @@ -85,8 +84,6 @@ extern thread_local char ErrorMessage[MaxMessageSize]; ur_result_t ErrorCode); /// ------ Error handling, matching OpenCL plugin semantics. -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { namespace ur { @@ -103,8 +100,6 @@ void assertion(bool Condition, const char *pMessage = nullptr); } // namespace ur } // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl /// RAII object that calls the reference count release function on the held UR /// object on destruction. 
@@ -162,8 +157,7 @@ template class ReleaseGuard { // HIP error for which it is unclear if the function that reported it // succeeded or not. Either way, the state of the program is compromised // and likely unrecoverable. - sycl::detail::ur::die( - "Unrecoverable program state reached in piMemRelease"); + detail::ur::die("Unrecoverable program state reached in piMemRelease"); } } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index 6e07390a4361a..e0fa5e294a641 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -14,7 +14,7 @@ int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute) { int Value; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&Value, Attribute, Device->get()) == hipSuccess); return Value; } @@ -44,11 +44,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { int ComputeUnits = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&ComputeUnits, hipDeviceAttributeMultiprocessorCount, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(ComputeUnits >= 0); + detail::ur::assertion(ComputeUnits >= 0); return ReturnValue(static_cast(ComputeUnits)); } case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { @@ -60,20 +60,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } return_sizes; int MaxX = 0, MaxY = 0, MaxZ = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxX, hipDeviceAttributeMaxBlockDimX, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxX >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxX, + hipDeviceAttributeMaxBlockDimX, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxX >= 0); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxY, 
hipDeviceAttributeMaxBlockDimY, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxY >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxY, + hipDeviceAttributeMaxBlockDimY, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxY >= 0); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxZ, hipDeviceAttributeMaxBlockDimZ, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxZ >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxZ, + hipDeviceAttributeMaxBlockDimZ, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxZ >= 0); return_sizes.sizes[0] = size_t(MaxX); return_sizes.sizes[1] = size_t(MaxY); @@ -87,20 +87,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } return_sizes; int MaxX = 0, MaxY = 0, MaxZ = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxX, hipDeviceAttributeMaxGridDimX, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxX >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxX, + hipDeviceAttributeMaxGridDimX, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxX >= 0); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxY, hipDeviceAttributeMaxGridDimY, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxY >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxY, + hipDeviceAttributeMaxGridDimY, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxY >= 0); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxZ, hipDeviceAttributeMaxGridDimZ, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxZ >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&MaxZ, + hipDeviceAttributeMaxGridDimZ, + hDevice->get()) == hipSuccess); + detail::ur::assertion(MaxZ >= 0); return_sizes.sizes[0] = size_t(MaxX); return_sizes.sizes[1] = size_t(MaxY); @@ -110,12 +110,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case 
UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { int MaxWorkGroupSize = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); + detail::ur::assertion(MaxWorkGroupSize >= 0); return ReturnValue(size_t(MaxWorkGroupSize)); } @@ -164,13 +164,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&MaxThreads, hipDeviceAttributeMaxThreadsPerBlock, hDevice->get()) == hipSuccess); int WarpSize = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&WarpSize, + hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; return ReturnValue(MaxWarps); } @@ -178,7 +178,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs int Major = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&Major, hipDeviceAttributeComputeCapabilityMajor, hDevice->get()) == hipSuccess); bool IFP = (Major >= 7); @@ -186,18 +186,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { int WarpSize = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&WarpSize, + hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); size_t Sizes[1] = {static_cast(WarpSize)}; return 
ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { int ClockFreq = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&ClockFreq, hipDeviceAttributeClockRate, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(ClockFreq >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&ClockFreq, + hipDeviceAttributeClockRate, + hDevice->get()) == hipSuccess); + detail::ur::assertion(ClockFreq >= 0); return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { @@ -212,8 +212,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CL_DEVICE_TYPE_CUSTOM. size_t Global = 0; - sycl::detail::ur::assertion(hipDeviceTotalMem(&Global, hDevice->get()) == - hipSuccess); + detail::ur::assertion(hipDeviceTotalMem(&Global, hDevice->get()) == + hipSuccess); auto QuarterGlobal = static_cast(Global / 4u); @@ -240,15 +240,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. int TexHeight = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexHeight, hipDeviceAttributeMaxTexture2DHeight, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexHeight >= 0); + detail::ur::assertion(TexHeight >= 0); int SurfHeight = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfHeight, hipDeviceAttributeMaxTexture2DHeight, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfHeight >= 0); + detail::ur::assertion(SurfHeight >= 0); int Min = std::min(TexHeight, SurfHeight); @@ -257,15 +257,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
int TexWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture2DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexWidth >= 0); + detail::ur::assertion(TexWidth >= 0); int SurfWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture2DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfWidth >= 0); + detail::ur::assertion(SurfWidth >= 0); int Min = std::min(TexWidth, SurfWidth); @@ -274,15 +274,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. int TexHeight = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexHeight, hipDeviceAttributeMaxTexture3DHeight, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexHeight >= 0); + detail::ur::assertion(TexHeight >= 0); int SurfHeight = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfHeight, hipDeviceAttributeMaxTexture3DHeight, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfHeight >= 0); + detail::ur::assertion(SurfHeight >= 0); int Min = std::min(TexHeight, SurfHeight); @@ -291,15 +291,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
int TexWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture3DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexWidth >= 0); + detail::ur::assertion(TexWidth >= 0); int SurfWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture3DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfWidth >= 0); + detail::ur::assertion(SurfWidth >= 0); int Min = std::min(TexWidth, SurfWidth); @@ -308,15 +308,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { // Take the smaller of maximum surface and maximum texture depth. int TexDepth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexDepth, hipDeviceAttributeMaxTexture3DDepth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexDepth >= 0); + detail::ur::assertion(TexDepth >= 0); int SurfDepth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfDepth, hipDeviceAttributeMaxTexture3DDepth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfDepth >= 0); + detail::ur::assertion(SurfDepth >= 0); int Min = std::min(TexDepth, SurfDepth); @@ -325,15 +325,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { // Take the smaller of maximum surface and maximum texture width. 
int TexWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&TexWidth, hipDeviceAttributeMaxTexture1DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(TexWidth >= 0); + detail::ur::assertion(TexWidth >= 0); int SurfWidth = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&SurfWidth, hipDeviceAttributeMaxTexture1DWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(SurfWidth >= 0); + detail::ur::assertion(SurfWidth >= 0); int Min = std::min(TexWidth, SurfWidth); @@ -354,7 +354,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { int MemBaseAddrAlign = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&MemBaseAddrAlign, hipDeviceAttributeTextureAlignment, hDevice->get()) == hipSuccess); @@ -395,18 +395,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { int CacheSize = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&CacheSize, hipDeviceAttributeL2CacheSize, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(CacheSize >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&CacheSize, + hipDeviceAttributeL2CacheSize, + hDevice->get()) == hipSuccess); + detail::ur::assertion(CacheSize >= 0); // The L2 cache is global to the GPU. return ReturnValue(static_cast(CacheSize)); } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. 
- sycl::detail::ur::assertion(hipDeviceTotalMem(&Bytes, hDevice->get()) == - hipSuccess); + detail::ur::assertion(hipDeviceTotalMem(&Bytes, hDevice->get()) == + hipSuccess); return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { @@ -416,11 +416,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // memory on AMD GPU may be larger than what can fit in the positive part // of a signed integer, so use an unsigned integer and cast the pointer to // int*. - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&ConstantMemory, hipDeviceAttributeTotalConstantMemory, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(ConstantMemory >= 0); + detail::ur::assertion(ConstantMemory >= 0); return ReturnValue(static_cast(ConstantMemory)); } @@ -438,30 +438,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // HIP has its own definition of "local memory", which maps to OpenCL's // "private memory". 
int LocalMemSize = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&LocalMemSize, hipDeviceAttributeMaxSharedMemoryPerBlock, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(LocalMemSize >= 0); + detail::ur::assertion(LocalMemSize >= 0); return ReturnValue(static_cast(LocalMemSize)); } case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { int EccEnabled = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&EccEnabled, hipDeviceAttributeEccEnabled, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&EccEnabled, + hipDeviceAttributeEccEnabled, + hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion((EccEnabled == 0) | (EccEnabled == 1)); + detail::ur::assertion((EccEnabled == 0) | (EccEnabled == 1)); auto Result = static_cast(EccEnabled); return ReturnValue(Result); } case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { int IsIntegrated = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&IsIntegrated, hipDeviceAttributeIntegrated, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&IsIntegrated, + hipDeviceAttributeIntegrated, + hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); auto Result = static_cast(IsIntegrated); return ReturnValue(Result); } @@ -513,14 +513,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_NAME: { static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; char Name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::ur::assertion(hipDeviceGetName(Name, MAX_DEVICE_NAME_LENGTH, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetName(Name, MAX_DEVICE_NAME_LENGTH, + hDevice->get()) == hipSuccess); // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch // name instead, this is also what AMD OpenCL devices return. 
if (strlen(Name) == 0) { hipDeviceProp_t Props; - sycl::detail::ur::assertion( - hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); + detail::ur::assertion(hipGetDeviceProperties(&Props, hDevice->get()) == + hipSuccess); return ReturnValue(Props.gcnArchName, strlen(Props.gcnArchName) + 1); } @@ -543,8 +543,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, std::stringstream S; hipDeviceProp_t Props; - sycl::detail::ur::assertion( - hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); + detail::ur::assertion(hipGetDeviceProperties(&Props, hDevice->get()) == + hipSuccess); #if defined(__HIP_PLATFORM_NVIDIA__) S << Props.major << "." << Props.minor; #elif defined(__HIP_PLATFORM_AMD__) @@ -567,8 +567,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, SupportedExtensions += " "; hipDeviceProp_t Props; - sycl::detail::ur::assertion( - hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); + detail::ur::assertion(hipGetDeviceProperties(&Props, hDevice->get()) == + hipSuccess); if (Props.arch.hasDoubles) { SupportedExtensions += "cl_khr_fp64 "; @@ -712,17 +712,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { int Major = 0, Minor = 0; - sycl::detail::ur::assertion( - hipDeviceComputeCapability(&Major, &Minor, hDevice->get()) == - hipSuccess); + detail::ur::assertion(hipDeviceComputeCapability( + &Major, &Minor, hDevice->get()) == hipSuccess); std::string Result = std::to_string(Major) + "." 
+ std::to_string(Minor); return ReturnValue(Result.c_str()); } case UR_DEVICE_INFO_ATOMIC_64: { hipDeviceProp_t Props; - sycl::detail::ur::assertion( - hipGetDeviceProperties(&Props, hDevice->get()) == hipSuccess); + detail::ur::assertion(hipGetDeviceProperties(&Props, hDevice->get()) == + hipSuccess); return ReturnValue(Props.arch.hasGlobalInt64Atomics && Props.arch.hasSharedInt64Atomics); } @@ -730,28 +729,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { size_t FreeMemory = 0; size_t TotalMemory = 0; - sycl::detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) == - hipSuccess, - "failed hipMemGetInfo() API."); + detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) == + hipSuccess, + "failed hipMemGetInfo() API."); return ReturnValue(FreeMemory); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { int Value = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&Value, hipDeviceAttributeMemoryClockRate, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(Value >= 0); + detail::ur::assertion(Value >= 0); // Convert kilohertz to megahertz when returning. 
return ReturnValue(Value / 1000); } case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { int Value = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipDeviceGetAttribute(&Value, hipDeviceAttributeMemoryBusWidth, hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(Value >= 0); + detail::ur::assertion(Value >= 0); return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { @@ -788,10 +787,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&Value, hipDeviceAttributePciDeviceId, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion(Value >= 0); + detail::ur::assertion(hipDeviceGetAttribute(&Value, + hipDeviceAttributePciDeviceId, + hDevice->get()) == hipSuccess); + detail::ur::assertion(Value >= 0); return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { @@ -799,8 +798,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, HIP_VERSION_MAJOR > 5) hipUUID UUID = {}; // Supported since 5.2+ - sycl::detail::ur::assertion(hipDeviceGetUuid(&UUID, hDevice->get()) == - hipSuccess); + detail::ur::assertion(hipDeviceGetUuid(&UUID, hDevice->get()) == + hipSuccess); std::array Name; std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); return ReturnValue(Name.data(), 16); @@ -815,7 +814,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, UR_CHECK_ERROR(hipDeviceGetAttribute( &MaxRegisters, hipDeviceAttributeMaxRegistersPerBlock, hDevice->get())); - sycl::detail::ur::assertion(MaxRegisters >= 0); + detail::ur::assertion(MaxRegisters >= 0); return ReturnValue(static_cast(MaxRegisters)); } @@ -826,15 +825,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PCI_ADDRESS: { constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; - sycl::detail::ur::assertion( - 
hipDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, + hDevice->get()) == hipSuccess); // A typical PCI address is 12 bytes + \0: "1234:67:90.2", but the HIP API // is not guaranteed to use this format. In practice, it uses this format, // at least in 5.3-5.5. To be on the safe side, we make sure the terminating // \0 is set. AddressBuffer[AddressBufferSize - 1] = '\0'; - sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) > 0); + detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) > 0); return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index 9cfa8ac721fd8..770f1d9601eb9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -29,7 +29,7 @@ static size_t imageElementByteSize(hipArray_Format ArrayFormat) { case HIP_AD_FORMAT_FLOAT: return 4; default: - sycl::detail::ur::die("Invalid image format."); + detail::ur::die("Invalid image format."); } return 0; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index ebdc612ff30b0..f5f7daa14d410 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -132,7 +132,7 @@ ur_result_t ur_event_handle_t_::record() { try { EventId = Queue->getNextEventId(); if (EventId == 0) { - sycl::detail::ur::die( + detail::ur::die( "Unrecoverable program state reached in event identifier overflow"); } Result = UR_CHECK_ERROR(hipEventRecord(EvEnd, Stream)); @@ -264,8 +264,8 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, UR_APIEXPORT ur_result_t UR_APICALL 
urEventRetain(ur_event_handle_t hEvent) { const auto RefCount = hEvent->incrementReferenceCount(); - sycl::detail::ur::assertion( - RefCount != 0, "Reference count overflow detected in urEventRetain."); + detail::ur::assertion(RefCount != 0, + "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; } @@ -273,9 +273,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - sycl::detail::ur::assertion( - hEvent->getReferenceCount() != 0, - "Reference count overflow detected in urEventRelease."); + detail::ur::assertion(hEvent->getReferenceCount() != 0, + "Reference count overflow detected in urEventRelease."); // decrement ref count. If it is 0, delete the event. if (hEvent->decrementReferenceCount() == 0) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 8c28dd86fd530..7b9bbf1992a71 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -58,26 +58,26 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, size_t GlobalWorkSize[3] = {0, 0, 0}; int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxBlockDimX, hipDeviceAttributeMaxBlockDimX, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxBlockDimY, hipDeviceAttributeMaxBlockDimY, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&MaxBlockDimZ, hipDeviceAttributeMaxBlockDimZ, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&MaxBlockDimX, + hipDeviceAttributeMaxBlockDimX, + hDevice->get()) == hipSuccess); + 
detail::ur::assertion(hipDeviceGetAttribute(&MaxBlockDimY, + hipDeviceAttributeMaxBlockDimY, + hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&MaxBlockDimZ, + hipDeviceAttributeMaxBlockDimZ, + hDevice->get()) == hipSuccess); int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, - hDevice->get()) == hipSuccess); - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&max_grid_dimX, + hipDeviceAttributeMaxGridDimX, + hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&max_grid_dimY, + hipDeviceAttributeMaxGridDimY, + hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&max_grid_dimZ, + hipDeviceAttributeMaxGridDimZ, + hDevice->get()) == hipSuccess); GlobalWorkSize[0] = MaxBlockDimX * max_grid_dimX; GlobalWorkSize[1] = MaxBlockDimY * max_grid_dimY; @@ -86,7 +86,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { int MaxThreads = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipFuncGetAttribute(&MaxThreads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == hipSuccess); @@ -105,7 +105,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // OpenCL LOCAL == HIP SHARED int Bytes = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipFuncGetAttribute(&Bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == hipSuccess); return ReturnValue(uint64_t(Bytes)); @@ -113,15 +113,15 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, 
ur_device_handle_t hDevice, case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size int WarpSize = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&WarpSize, + hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == HIP LOCAL int Bytes = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipFuncGetAttribute(&Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()) == hipSuccess); return ReturnValue(uint64_t(Bytes)); @@ -220,15 +220,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size int WarpSize = 0; - sycl::detail::ur::assertion( - hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, - hDevice->get()) == hipSuccess); + detail::ur::assertion(hipDeviceGetAttribute(&WarpSize, + hipDeviceAttributeWarpSize, + hDevice->get()) == hipSuccess); return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder int MaxThreads = 0; - sycl::detail::ur::assertion( + detail::ur::assertion( hipFuncGetAttribute(&MaxThreads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == hipSuccess); @@ -289,7 +289,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && Format != HIP_AD_FORMAT_SIGNED_INT32 && Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) { - sycl::detail::ur::die( + detail::ur::die( "UR HIP kernels only support images with channel types int32, " "uint32, float, and half."); } diff --git 
a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp index db7f716393ec2..06578b5817994 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -59,8 +59,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely // unrecoverable. - sycl::detail::ur::die( - "Unrecoverable program state reached in urMemRelease"); + detail::ur::die("Unrecoverable program state reached in urMemRelease"); } return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp index 2b60e225781a7..8c9464aa9a587 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/sampler.cpp @@ -67,7 +67,7 @@ ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. 
- sycl::detail::ur::assertion( + detail::ur::assertion( hSampler->getReferenceCount() != 0, "Reference count overflow detected in urSamplerRelease."); From 15d9114dfad7960d21f1558df406a1032b28c023 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 3 Jul 2023 12:54:38 +0100 Subject: [PATCH 36/42] [SYCL][HIP][UR] Add UR headers to hip unittests --- sycl/unittests/pi/hip/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/unittests/pi/hip/CMakeLists.txt b/sycl/unittests/pi/hip/CMakeLists.txt index 5965cbff1a1da..eee75b0447551 100644 --- a/sycl/unittests/pi/hip/CMakeLists.txt +++ b/sycl/unittests/pi/hip/CMakeLists.txt @@ -22,6 +22,7 @@ target_include_directories(PiHipTests "${sycl_inc_dir}/sycl/detail/" "${sycl_inc_dir}" "${sycl_plugin_dir}/hip/" + "${sycl_plugin_dir}/unified_runtime/" ) if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD") @@ -37,4 +38,5 @@ endif() target_link_libraries(PiHipTests PRIVATE rocmdrv + UnifiedRuntime-Headers ) From 43f8edcc8a63f829182d71f8204d7d8cdef3f10f Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 5 Jul 2023 14:19:17 +0100 Subject: [PATCH 37/42] [SYCL][UR][HIP] Port command buffer entries --- sycl/plugins/hip/CMakeLists.txt | 2 + sycl/plugins/hip/pi_hip.cpp | 21 ++ sycl/plugins/hip/pi_hip.hpp | 5 + sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/hip/command_buffer.cpp | 250 ++++++++++++++++++ .../ur/adapters/hip/command_buffer.hpp | 13 + 6 files changed, 293 insertions(+) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.hpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index bd354b10ca91f..32d2520af3daa 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -114,6 +114,8 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/kernel.hpp" "../unified_runtime/ur/adapters/hip/queue.cpp" 
"../unified_runtime/ur/adapters/hip/queue.hpp" + "../unified_runtime/ur/adapters/hip/command_buffer.cpp" + "../unified_runtime/ur/adapters/hip/command_buffer.hpp" "../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index bd9791ee1b696..eecbb705ba691 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -186,6 +186,27 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) + // command-buffer + _PI_CL(piextCommandBufferCreate, pi2ur::piextCommandBufferCreate) + _PI_CL(piextCommandBufferRetain, pi2ur::piextCommandBufferRetain) + _PI_CL(piextCommandBufferRelease, pi2ur::piextCommandBufferRelease) + _PI_CL(piextCommandBufferNDRangeKernel, + pi2ur::piextCommandBufferNDRangeKernel) + _PI_CL(piextCommandBufferMemcpyUSM, pi2ur::piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, + pi2ur::piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + pi2ur::piextCommandBufferMemBufferCopyRect) + _PI_CL(piextCommandBufferMemBufferRead, + pi2ur::piextCommandBufferMemBufferRead) + _PI_CL(piextCommandBufferMemBufferReadRect, + pi2ur::piextCommandBufferMemBufferReadRect) + _PI_CL(piextCommandBufferMemBufferWrite, + pi2ur::piextCommandBufferMemBufferWrite) + _PI_CL(piextCommandBufferMemBufferWriteRect, + pi2ur::piextCommandBufferMemBufferWriteRect) + _PI_CL(piextEnqueueCommandBuffer, pi2ur::piextEnqueueCommandBuffer) + #undef _PI_CL return PI_SUCCESS; diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 7fd71881b7c83..3ab21101228fe 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -89,4 +90,8 @@ struct _pi_sampler : 
ur_sampler_handle_t_ { using ur_sampler_handle_t_::ur_sampler_handle_t_; }; +struct _pi_ext_command_buffer : ur_exp_command_buffer_handle_t_ { + using ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_; +}; + #endif // PI_HIP_HPP diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index f0b790cb36c85..0874d3a111947 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -202,6 +202,8 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/kernel.hpp" "ur/adapters/hip/queue.cpp" "ur/adapters/hip/queue.hpp" + "ur/adapters/hip/command_buffer.hpp" + "ur/adapters/hip/command_buffer.cpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp new file mode 100644 index 0000000000000..b449c6f743b85 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp @@ -0,0 +1,250 @@ +//===--------- command_buffer.cpp - HIP Adapter ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "common.hpp" + +/// Stub implementations of UR experimental feature command-buffers + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + ur_exp_command_buffer_handle_t *phCommandBuffer) { + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = pCommandBufferDesc; + std::ignore = phCommandBuffer; + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + std::ignore = hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + std::ignore = hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + std::ignore = hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numSyncPointsInWaitList, + const 
ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hKernel; + std::ignore = workDim; + std::ignore = pGlobalWorkOffset; + std::ignore = pGlobalWorkSize; + std::ignore = pLocalWorkSize; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = pDst; + std::ignore = pSrc; + std::ignore = size; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hSrcMem; + std::ignore = hDstMem; + std::ignore = srcOffset; + std::ignore = dstOffset; + std::ignore = size; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hSrcMem; + std::ignore = hDstMem; + std::ignore = srcOrigin; + std::ignore = dstOrigin; + std::ignore = region; + std::ignore = srcRowPitch; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, const void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hBuffer; + std::ignore = offset; + std::ignore = size; + std::ignore = pSrc; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t 
*pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hBuffer; + std::ignore = offset; + std::ignore = size; + std::ignore = pDst; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hBuffer; + std::ignore = bufferOffset; + std::ignore = hostOffset; + std::ignore = region; + std::ignore = bufferRowPitch; + std::ignore = bufferSlicePitch; + std::ignore = hostRowPitch; + std::ignore = hostSlicePitch; + std::ignore = pSrc; + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t 
*pSyncPoint) { + std::ignore = hCommandBuffer; + std::ignore = hBuffer; + std::ignore = bufferOffset; + std::ignore = hostOffset; + std::ignore = region; + std::ignore = bufferRowPitch; + std::ignore = bufferSlicePitch; + std::ignore = hostRowPitch; + std::ignore = hostSlicePitch; + std::ignore = pDst; + + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hCommandBuffer; + std::ignore = hQueue; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.hpp new file mode 100644 index 0000000000000..9bcdbfeccf17d --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.hpp @@ -0,0 +1,13 @@ +//===--------- command_buffer.hpp - HIP Adapter ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +/// Stub implementation of command-buffers for HIP + +struct ur_exp_command_buffer_handle_t_ {}; From 2cae55a640e18caa9bd2cb49dbc09f2311befeb4 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 5 Jul 2023 15:17:10 +0100 Subject: [PATCH 38/42] [SYCL][UR][HIP] Ignore unused parameters in command buffer entries --- .../plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp index b449c6f743b85..f6e633e0a2908 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp @@ -127,6 +127,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( std::ignore = dstOrigin; std::ignore = region; std::ignore = srcRowPitch; + std::ignore = srcSlicePitch; + std::ignore = dstRowPitch; + std::ignore = dstSlicePitch; std::ignore = numSyncPointsInWaitList; std::ignore = pSyncPointWaitList; std::ignore = pSyncPoint; From 5ad5f7691619a8c3e04e9ffed2a2a7379a79f320 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 10 Jul 2023 10:29:07 +0100 Subject: [PATCH 39/42] [SYCL][UR][HIP] Style fixups and add command buffers to ddi --- .../ur/adapters/hip/command_buffer.cpp | 194 ++++-------------- .../ur/adapters/hip/common.cpp | 33 ++- .../ur/adapters/hip/common.hpp | 6 +- .../ur/adapters/hip/context.cpp | 23 +-- .../ur/adapters/hip/device.cpp | 30 +-- .../ur/adapters/hip/device.hpp | 1 - .../ur/adapters/hip/enqueue.cpp | 112 ++-------- .../unified_runtime/ur/adapters/hip/event.cpp | 29 +-- .../unified_runtime/ur/adapters/hip/event.hpp | 3 - .../ur/adapters/hip/kernel.cpp | 58 ++---- .../ur/adapters/hip/kernel.hpp | 5 +- 
.../ur/adapters/hip/memory.cpp | 34 +-- .../ur/adapters/hip/memory.hpp | 7 +- .../ur/adapters/hip/platform.cpp | 20 +- .../ur/adapters/hip/program.cpp | 41 +--- .../ur/adapters/hip/program.hpp | 1 - .../unified_runtime/ur/adapters/hip/queue.cpp | 18 +- .../ur/adapters/hip/ur_interface_loader.cpp | 27 +++ .../unified_runtime/ur/adapters/hip/usm.cpp | 11 +- 19 files changed, 179 insertions(+), 474 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp index f6e633e0a2908..4a559e33a1273 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/command_buffer.cpp @@ -12,128 +12,67 @@ /// Stub implementations of UR experimental feature command-buffers UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_exp_command_buffer_desc_t *pCommandBufferDesc, - ur_exp_command_buffer_handle_t *phCommandBuffer) { - std::ignore = hContext; - std::ignore = hDevice; - std::ignore = pCommandBufferDesc; - std::ignore = phCommandBuffer; + ur_context_handle_t, ur_device_handle_t, + const ur_exp_command_buffer_desc_t *, ur_exp_command_buffer_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - std::ignore = hCommandBuffer; - +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - std::ignore = hCommandBuffer; - +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t) { 
detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - std::ignore = hCommandBuffer; - +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hKernel; - std::ignore = workDim; - std::ignore = pGlobalWorkOffset; - std::ignore = pGlobalWorkSize; - std::ignore = pLocalWorkSize; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_kernel_handle_t, uint32_t, + const size_t *, const size_t *, const size_t *, uint32_t, + const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, - size_t size, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = pDst; - std::ignore = pSrc; - std::ignore = size; - std::ignore = 
numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, void *, const void *, size_t, uint32_t, + const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, - ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hSrcMem; - std::ignore = hDstMem; - std::ignore = srcOffset; - std::ignore = dstOffset; - std::ignore = size; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t, size_t, + size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, - ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, - size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hSrcMem; - std::ignore = hDstMem; - std::ignore = srcOrigin; - std::ignore = 
dstOrigin; - std::ignore = region; - std::ignore = srcRowPitch; - std::ignore = srcSlicePitch; - std::ignore = dstRowPitch; - std::ignore = dstSlicePitch; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t, + ur_rect_offset_t, ur_rect_offset_t, ur_rect_region_t, size_t, size_t, + size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -141,20 +80,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - size_t offset, size_t size, const void *pSrc, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hBuffer; - std::ignore = offset; - std::ignore = size; - std::ignore = pSrc; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t, + const void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -162,19 +90,9 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, - const 
ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hBuffer; - std::ignore = offset; - std::ignore = size; - std::ignore = pDst; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t, void *, + uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -182,27 +100,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hBuffer; - std::ignore = bufferOffset; - std::ignore = hostOffset; - std::ignore = region; - std::ignore = bufferRowPitch; - std::ignore = bufferSlicePitch; - std::ignore = hostRowPitch; - std::ignore = hostSlicePitch; - std::ignore = pSrc; - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t, + ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *, + uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -210,43 +111,18 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pDst, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - std::ignore = hCommandBuffer; - std::ignore = hBuffer; - std::ignore = bufferOffset; - std::ignore = hostOffset; - std::ignore = region; - std::ignore = bufferRowPitch; - std::ignore = bufferSlicePitch; - std::ignore = hostRowPitch; - std::ignore = hostSlicePitch; - std::ignore = pDst; - - std::ignore = numSyncPointsInWaitList; - std::ignore = pSyncPointWaitList; - std::ignore = pSyncPoint; - + ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t, + ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *, + uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hCommandBuffer; - std::ignore = hQueue; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_exp_command_buffer_handle_t, ur_queue_handle_t, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented 
for HIP adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp index 41c18b798db07..36740fb0147a4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.cpp @@ -40,15 +40,12 @@ ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, const char *ErrorName = nullptr; ErrorName = hipGetErrorName(Result); ErrorString = hipGetErrorString(Result); - std::stringstream SS; - SS << "\nUR HIP ERROR:" - << "\n\tValue: " << Result - << "\n\tName: " << ErrorName - << "\n\tDescription: " << ErrorString - << "\n\tFunction: " << Function << "\n\tSource Location: " << File - << ":" << Line << "\n" - << std::endl; - std::cerr << SS.str(); + std::cerr << "\nUR HIP ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function + << "\n\tSource Location: " << File << ":" << Line << "\n\n"; } if (std::getenv("PI_HIP_ABORT") != nullptr || @@ -59,19 +56,21 @@ ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, throw mapErrorUR(Result); } -std::string getHipVersionString() { +hipError_t getHipVersionString(std::string &Version) { int DriverVersion = 0; - if (hipDriverGetVersion(&DriverVersion) != hipSuccess) { - return ""; + auto Result = hipDriverGetVersion(&DriverVersion); + if (Result != hipSuccess) { + return Result; } // The version is returned as (1000 major + 10 minor). std::stringstream Stream; Stream << "HIP " << DriverVersion / 1000 << "." 
<< DriverVersion % 1000 / 10; - return Stream.str(); + Version = Stream.str(); + return Result; } void detail::ur::die(const char *pMessage) { - std::cerr << "ur_die: " << pMessage << std::endl; + std::cerr << "ur_die: " << pMessage << '\n'; std::terminate(); } @@ -81,7 +80,7 @@ void detail::ur::assertion(bool Condition, const char *pMessage) { } void detail::ur::hipPrint(const char *pMessage) { - std::cerr << "ur_print: " << pMessage << std::endl; + std::cerr << "ur_print: " << pMessage << '\n'; } // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR @@ -91,8 +90,8 @@ thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { - assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); + assert(strlen(pMessage) < MaxMessageSize); + strncpy(ErrorMessage, pMessage, MaxMessageSize - 1); ErrorMessageCode = ErrorCode; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp index c3d3a6e23dd18..b1ebaf4c84df1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/common.hpp @@ -30,7 +30,7 @@ inline void getArrayDesc(hipArray *Array, hipArray_Format &Format, #endif } -// NVidia HIP headers guard hipArray3DCreate behind __CUDACC__, this does not +// HIP on NVIDIA headers guard hipArray3DCreate behind __CUDACC__, this does not // seem to be required and we're not using nvcc to build the UR HIP adapter so // add the translation function here #if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__) @@ -73,7 +73,7 @@ ur_result_t checkErrorUR(hipError_t Result, const char *Function, int Line, #define UR_CHECK_ERROR(result) \ checkErrorUR(result, __func__, __LINE__, __FILE__) -std::string getHipVersionString(); +hipError_t getHipVersionString(std::string &Version); constexpr size_t 
MaxMessageSize = 256; extern thread_local ur_result_t ErrorMessageCode; @@ -173,4 +173,4 @@ template class ReleaseGuard { /// End the guard and do not release the reference count of the held /// UR object. void dismiss() { Captive = nullptr; } -}; \ No newline at end of file +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp index c3e34df5ee517..fe392e36cc225 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/context.cpp @@ -13,13 +13,10 @@ /// By default creates a scoped context and keeps the last active HIP context /// on top of the HIP context stack. /// -UR_APIEXPORT ur_result_t UR_APICALL -urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, - const ur_context_properties_t *pProperties, - ur_context_handle_t *phContext) { +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *, ur_context_handle_t *phContext) { std::ignore = DeviceCount; - std::ignore = pProperties; - assert(DeviceCount == 1); ur_result_t RetErr = UR_RESULT_SUCCESS; @@ -47,7 +44,7 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, RetErr); // For non-primary scoped contexts keep the last active on top of the stack - // as `cuCtxCreate` replaces it implicitly otherwise. + // as `hipCtxCreate` replaces it implicitly otherwise. // Primary contexts are kept on top of the stack, so the previous context // is not queried and therefore not recovered. 
if (Current != nullptr) { @@ -151,16 +148,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( - ur_native_handle_t hNativeContext, uint32_t numDevices, - const ur_device_handle_t *phDevices, - const ur_context_native_properties_t *pProperties, - ur_context_handle_t *phContext) { - std::ignore = hNativeContext; - std::ignore = numDevices; - std::ignore = phDevices; - std::ignore = pProperties; - std::ignore = phContext; - + ur_native_handle_t, uint32_t, const ur_device_handle_t *, + const ur_context_native_properties_t *, ur_context_handle_t *) { return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp index e0fa5e294a641..866819ca3c07f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.cpp @@ -530,8 +530,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue("AMD Corporation"); } case UR_DEVICE_INFO_DRIVER_VERSION: { - auto version = getHipVersionString(); - return ReturnValue(version.c_str()); + std::string Version; + detail::ur::assertion(getHipVersionString(Version) == hipSuccess); + return ReturnValue(Version.c_str()); } case UR_DEVICE_INFO_PROFILE: { return ReturnValue("HIP"); @@ -857,8 +858,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, /// \return UR_RESULT_SUCCESS if the function is executed successfully /// HIP devices are always root devices so retain always returns success. 
-UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { - std::ignore = hDevice; +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t) { return UR_RESULT_SUCCESS; } @@ -870,9 +870,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, /// \return UR_RESULT_SUCCESS always since HIP devices are always root /// devices. -UR_APIEXPORT ur_result_t UR_APICALL -urDeviceRelease(ur_device_handle_t hDevice) { - std::ignore = hDevice; +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t) { return UR_RESULT_SUCCESS; } @@ -923,25 +921,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, - const ur_device_native_properties_t *pProperties, - ur_device_handle_t *phDevice) { - std::ignore = hNativeDevice; - std::ignore = hPlatform; - std::ignore = pProperties; - std::ignore = phDevice; - + ur_native_handle_t, ur_platform_handle_t, + const ur_device_native_properties_t *, ur_device_handle_t *) { return UR_RESULT_ERROR_INVALID_OPERATION; } /// \return UR_RESULT_SUCCESS If available, the first binary that is PTX /// -UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( - ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, - uint32_t NumBinaries, uint32_t *pSelectedBinary) { +UR_APIEXPORT ur_result_t UR_APICALL +urDeviceSelectBinary(ur_device_handle_t, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { // Ignore unused parameter - std::ignore = hDevice; - UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); // Look for an image for the HIP target, and return the first one that is diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp index 370aaee5424b2..9a56652957663 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/device.hpp @@ -15,7 +15,6 @@ /// Includes an observer pointer to the platform, /// and implements the reference counting semantics since /// HIP objects are not refcounted. -/// struct ur_device_handle_t_ { private: using native_type = hipDevice_t; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp index 770f1d9601eb9..c5042f64bcc7b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp @@ -73,19 +73,12 @@ void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock, assert(Kernel != nullptr); std::ignore = Kernel; - // int recommendedBlockSize, minGrid; - - // UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( - // &minGrid, &recommendedBlockSize, Kernel->get(), - // 0, 0)); - - //(void)minGrid; // Not used, avoid warnings ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]); // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. 
- while (0u != (GlobalWorkSize[0] % ThreadsPerBlock[0])) { + while (GlobalWorkSize[0] % ThreadsPerBlock[0]) { --ThreadsPerBlock[0]; } } @@ -890,11 +883,9 @@ static ur_result_t commonEnqueueMemImageNDCopy( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, - ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, + ur_rect_offset_t origin, ur_rect_region_t region, size_t, size_t, + void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = rowPitch; - std::ignore = slicePitch; UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -953,13 +944,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( - ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, - ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool, + ur_rect_offset_t origin, ur_rect_region_t region, size_t, size_t, + void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = blockingWrite; - std::ignore = rowPitch; - std::ignore = slicePitch; UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -1338,9 +1326,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { - std::ignore = advice; - + ur_usm_advice_flags_t, ur_event_handle_t *phEvent) { void *HIPDevicePtr = const_cast(pMem); unsigned int PointerRangeSize = 0; 
UR_CHECK_ERROR(hipPointerGetAttribute(&PointerRangeSize, @@ -1354,22 +1340,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( - ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize, - const void *pPattern, size_t width, size_t height, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - - std::ignore = hQueue; - std::ignore = pMem; - std::ignore = pitch; - std::ignore = patternSize; - std::ignore = pPattern; - std::ignore = width; - std::ignore = height; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *phEvent) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -1423,75 +1395,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, - bool blockingWrite, size_t count, size_t offset, const void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hQueue; - std::ignore = hProgram; - std::ignore = name; - std::ignore = blockingWrite; - std::ignore = count; - std::ignore = offset; - std::ignore = pSrc; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_queue_handle_t, ur_program_handle_t, const char *, bool, size_t, size_t, + const void *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, - bool blockingRead, size_t count, size_t offset, void *pDst, - 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hQueue; - std::ignore = hProgram; - std::ignore = name; - std::ignore = blockingRead; - std::ignore = count; - std::ignore = offset; - std::ignore = pDst; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_queue_handle_t, ur_program_handle_t, const char *, bool, size_t, size_t, + void *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pDst, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hQueue; - std::ignore = hProgram; - std::ignore = pipe_symbol; - std::ignore = blocking; - std::ignore = pDst; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_queue_handle_t, ur_program_handle_t, const char *, bool, void *, size_t, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hQueue; - std::ignore = hProgram; - std::ignore = pipe_symbol; - std::ignore = blocking; - std::ignore = pSrc; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - + ur_queue_handle_t, ur_program_handle_t, const char *, bool, void *, size_t, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp index f5f7daa14d410..93faf2def0ac5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.cpp @@ -79,7 +79,7 @@ bool ur_event_handle_t_::isCompleted() const noexcept { } uint64_t ur_event_handle_t_::getQueuedTime() const { - float MiliSeconds = 0.0f; + float MilliSeconds = 0.0f; assert(isStarted()); // hipEventSynchronize waits till the event is ready for call to @@ -87,8 +87,8 @@ uint64_t ur_event_handle_t_::getQueuedTime() const { UR_CHECK_ERROR(hipEventSynchronize(EvStart)); UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); - UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, EvStart, EvEnd)); - return static_cast(MiliSeconds * 1.0e6); + UR_CHECK_ERROR(hipEventElapsedTime(&MilliSeconds, EvStart, EvEnd)); + return static_cast(MilliSeconds * 1.0e6); } uint64_t ur_event_handle_t_::getStartTime() const { @@ -250,14 +250,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return {}; } -UR_APIEXPORT ur_result_t UR_APICALL -urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, - ur_event_callback_t pfnNotify, void *pUserData) { - std::ignore = hEvent; - std::ignore = execStatus; - std::ignore = pfnNotify; - std::ignore = pUserData; - +UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -314,14 +310,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( /// /// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( - ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, - const ur_event_native_properties_t *pProperties, - ur_event_handle_t *phEvent) { - - std::ignore = hNativeEvent; - std::ignore = hContext; - 
std::ignore = pProperties; - std::ignore = phEvent; - + ur_native_handle_t, ur_context_handle_t, + const ur_event_native_properties_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp index 6311c942b4a61..5960f384cdfd5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/event.hpp @@ -61,15 +61,12 @@ struct ur_event_handle_t_ { uint32_t getEventId() const noexcept { return EventId; } // Returns the counter time when the associated command(s) were enqueued - // uint64_t getQueuedTime() const; // Returns the counter time when the associated command(s) started execution - // uint64_t getStartTime() const; // Returns the counter time when the associated command(s) completed - // uint64_t getEndTime() const; // construct a native HIP. This maps closely to the underlying HIP event. diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp index 7b9bbf1992a71..709657ab0c947 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.cpp @@ -158,19 +158,14 @@ urKernelRelease(ur_kernel_handle_t hKernel) { // TODO(ur): Not implemented on hip atm. Also, need to add tests for this // feature. 
-UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( - ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { - std::ignore = hKernel; - std::ignore = phNativeKernel; - +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, - const ur_kernel_arg_value_properties_t *pProperties, - const void *pArgValue) { - std::ignore = pProperties; + const ur_kernel_arg_value_properties_t *, const void *pArgValue) { ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { @@ -258,20 +253,16 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgPointer(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_pointer_properties_t *pProperties, - const void *pArgValue) { - std::ignore = pProperties; +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_pointer_properties_t *, const void *pArgValue) { hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_mem_obj_properties_t *pProperties, - ur_mem_handle_t hArgValue) { - std::ignore = pProperties; +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *, ur_mem_handle_t hArgValue) { // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. 
if (hArgValue == nullptr) { @@ -307,11 +298,9 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, return Result; } -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_sampler_properties_t *pProperties, - ur_sampler_handle_t hArgValue) { - std::ignore = pProperties; +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( + ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_sampler_properties_t *, ur_sampler_handle_t hArgValue) { ur_result_t Result = UR_RESULT_SUCCESS; try { uint32_t SamplerProps = hArgValue->Props; @@ -323,27 +312,14 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, } // A NOP for the HIP backend -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( - ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, - const ur_kernel_exec_info_properties_t *pProperties, - const void *pPropValue) { - std::ignore = hKernel; - std::ignore = propName; - std::ignore = propSize; - std::ignore = pProperties; - std::ignore = pPropValue; +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetExecInfo(ur_kernel_handle_t, ur_kernel_exec_info_t, size_t, + const ur_kernel_exec_info_properties_t *, const void *) { return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( - ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, - ur_program_handle_t hProgram, - const ur_kernel_native_properties_t *pProperties, - ur_kernel_handle_t *phKernel) { - std::ignore = hNativeKernel; - std::ignore = hContext; - std::ignore = hProgram; - std::ignore = pProperties; - std::ignore = phKernel; + ur_native_handle_t, ur_context_handle_t, ur_program_handle_t, + const ur_kernel_native_properties_t *, ur_kernel_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp 
b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp index b648f0b9afeee..0e4f3c0ea8bd0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp @@ -46,7 +46,6 @@ struct ur_kernel_handle_t_ { /// This is not something can be queried from the HIP API /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) /// and a storage. - /// struct arguments { static constexpr size_t MAX_PARAM_BYTES = 4000u; using args_t = std::array; @@ -105,8 +104,8 @@ struct ur_kernel_handle_t_ { AlignedLocalOffset += Alignment - Pad; } - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - Size + (AlignedLocalOffset - LocalOffset)); + addArg(Index, sizeof(size_t), (const void *)&AlignedLocalOffset, + Size + AlignedLocalOffset - LocalOffset); } void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp index 06578b5817994..3401b5beff148 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.cpp @@ -297,25 +297,15 @@ urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { } UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( - ur_native_handle_t hNativeMem, ur_context_handle_t hContext, - const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { - std::ignore = hNativeMem; - std::ignore = hContext; - std::ignore = pProperties; - std::ignore = phMem; + ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *, + ur_mem_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( - ur_native_handle_t hNativeMem, ur_context_handle_t hContext, - const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - const ur_mem_native_properties_t 
*pProperties, ur_mem_handle_t *phMem) { - std::ignore = hNativeMem; - std::ignore = hContext; - std::ignore = pImageFormat; - std::ignore = pImageDesc; - std::ignore = pProperties; - std::ignore = phMem; + ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *, + const ur_image_desc_t *, const ur_mem_native_properties_t *, + ur_mem_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -495,15 +485,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( } /// \TODO Not implemented -UR_APIEXPORT ur_result_t UR_APICALL -urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, - size_t propSize, void *pImgInfo, size_t *pPropSizeRet) { - std::ignore = hMemory; - std::ignore = ImgInfoType; - std::ignore = propSize; - std::ignore = pImgInfo; - std::ignore = pPropSizeRet; - +UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t, + ur_image_info_t, size_t, + void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -511,4 +495,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; -} \ No newline at end of file +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp index ad1d62641f39a..0219084d8b2c8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/memory.hpp @@ -110,8 +110,7 @@ struct ur_mem_handle_t_ { } /// Detach the allocation from the host memory. 
- void unmap(void *Ptr) noexcept { - std::ignore = Ptr; + void unmap(void *) noexcept { assert(MapPtr != nullptr); if (MapPtr != HostPtr) { @@ -165,10 +164,8 @@ struct ur_mem_handle_t_ { /// Constructs the UR allocation for an Image object ur_mem_handle_t_(ur_context Ctxt, hipArray *Array, hipSurfaceObject_t Surf, - ur_mem_flags_t MemFlags, ur_mem_type_t ImageType, - void *HostPtr) + ur_mem_flags_t MemFlags, ur_mem_type_t ImageType, void *) : Context{Ctxt}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags} { - std::ignore = HostPtr; Mem.SurfaceMem.Array = Array; Mem.SurfaceMem.ImageType = ImageType; Mem.SurfaceMem.SurfObj = Surf; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp index a1883d80975be..11f8fc55d44ce 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/platform.cpp @@ -11,9 +11,8 @@ hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; UR_APIEXPORT ur_result_t UR_APICALL -urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, +urPlatformGetInfo(ur_platform_handle_t, ur_platform_info_t propName, size_t propSize, void *pPropValue, size_t *pSizeRet) { - std::ignore = hPlatform; UrReturnHelper ReturnValue(propSize, pPropValue, pSizeRet); switch (propName) { @@ -24,8 +23,9 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, case UR_PLATFORM_INFO_PROFILE: return ReturnValue("FULL PROFILE"); case UR_PLATFORM_INFO_VERSION: { - auto version = getHipVersionString(); - return ReturnValue(version.c_str()); + std::string Version; + detail::ur::assertion(getHipVersionString(Version) == hipSuccess); + return ReturnValue(Version.c_str()); } case UR_PLATFORM_INFO_BACKEND: { return ReturnValue(UR_PLATFORM_BACKEND_HIP); @@ -121,9 +121,8 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, } } -UR_APIEXPORT ur_result_t UR_APICALL 
urPlatformGetApiVersion( - ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { - std::ignore = hDriver; +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetApiVersion(ur_platform_handle_t, ur_api_version_t *pVersion) { *pVersion = UR_API_VERSION_CURRENT; return UR_RESULT_SUCCESS; } @@ -140,10 +139,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) { // Current support is only for optimization options. // Return empty string for cuda. // TODO: Determine correct string to be passed. -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( - ur_platform_handle_t hPlatform, const char *pFrontendOption, - const char **ppPlatformOption) { - std::ignore = hPlatform; +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetBackendOption(ur_platform_handle_t, const char *pFrontendOption, + const char **ppPlatformOption) { using namespace std::literals; if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp index 80588f10aaa98..a66c444c4d9f8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.cpp @@ -53,7 +53,7 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { hipModuleLoadDataEx(&Module, static_cast(Binary), NumberOfOptions, Options, OptionVals)); - const auto Success = (Result == UR_RESULT_SUCCESS); + const bool Success = (Result == UR_RESULT_SUCCESS); BuildStatus = Success ? 
UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; @@ -79,7 +79,7 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { ur_device_handle_t hDevice = hContext->getDevice(); - auto pBinary = reinterpret_cast(pIL); + const auto pBinary = reinterpret_cast(pIL); return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, pProperties, phProgram); @@ -88,7 +88,6 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, /// HIP will handle the PTX/HIPBIN binaries internally through a call to /// hipModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent /// in terms of HIP adapter. \TODO Implement asynchronous compilation -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { @@ -98,11 +97,9 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, /// Loads the images from a UR program into a hipModule_t that can be /// used later on to extract functions (kernels). /// See \ref ur_program_handle_t for implementation details. 
-UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, ur_program_handle_t hProgram, const char *pOptions) { - std::ignore = hContext; - ur_result_t Result = UR_RESULT_SUCCESS; try { @@ -116,16 +113,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, return Result; } -UR_APIEXPORT ur_result_t UR_APICALL -urProgramLink(ur_context_handle_t hContext, uint32_t count, - const ur_program_handle_t *phPrograms, const char *pOptions, - ur_program_handle_t *phProgram) { - std::ignore = hContext; - std::ignore = count; - std::ignore = phPrograms; - std::ignore = pOptions; - std::ignore = phProgram; - +UR_APIEXPORT ur_result_t UR_APICALL urProgramLink(ur_context_handle_t, uint32_t, + const ur_program_handle_t *, + const char *, + ur_program_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -139,24 +130,16 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, /// /// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( - ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, - const ur_program_native_properties_t *pProperties, - ur_program_handle_t *phProgram) { - std::ignore = hNativeProgram; - std::ignore = hContext; - std::ignore = pProperties; - std::ignore = phProgram; - + ur_native_handle_t, ur_context_handle_t, + const ur_program_native_properties_t *, ur_program_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL -urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t, ur_program_build_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { // Ignore unused parameter - std::ignore = hDevice; - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -259,10 
+242,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( /// Note: Only supports one device UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, - const uint8_t *pBinary, const ur_program_properties_t *pProperties, + const uint8_t *pBinary, const ur_program_properties_t *, ur_program_handle_t *phProgram) { - std::ignore = pProperties; - UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp index b895c206479d5..9c233dbd99598 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/program.hpp @@ -14,7 +14,6 @@ #include "context.hpp" /// Implementation of UR Program on HIP Module object -/// struct ur_program_handle_t_ { using native_type = hipModule_t; native_type Module; diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp index e8438c8df5c38..19447bcf8ae93 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/queue.cpp @@ -232,8 +232,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { // There is no HIP counterpart for queue flushing and we don't run into the // same problem of having to flush cross-queue dependencies as some of the // other plugins, so it can be left as no-op. 
-UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { - std::ignore = hQueue; +UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } @@ -244,10 +243,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { /// /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL -urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *, ur_native_handle_t *phNativeQueue) { - std::ignore = pDesc; - ScopedContext Active(hQueue->getContext()); *phNativeQueue = reinterpret_cast(hQueue->getNextComputeStream()); @@ -267,14 +264,7 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, /// /// \return UR_RESULT_ERROR_UNSUPPORTED_FEATURE UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( - ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, - ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, - ur_queue_handle_t *phQueue) { - - std::ignore = hNativeQueue; - std::ignore = hContext; - std::ignore = hDevice; - std::ignore = pProperties; - std::ignore = phQueue; + ur_native_handle_t, ur_context_handle_t, ur_device_handle_t, + const ur_queue_native_properties_t *, ur_queue_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 6d4a8eadda747..15105bb472e71 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -259,6 +259,33 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ur_command_buffer_exp_dditable_t 
*pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnCreateExp = urCommandBufferCreateExp; + pDdiTable->pfnRetainExp = urCommandBufferRetainExp; + pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; + pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; + pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; + pDdiTable->pfnAppendMemcpyUSMExp = urCommandBufferAppendMemcpyUSMExp; + pDdiTable->pfnAppendMembufferCopyExp = urCommandBufferAppendMembufferCopyExp; + pDdiTable->pfnAppendMembufferCopyRectExp = + urCommandBufferAppendMembufferCopyRectExp; + pDdiTable->pfnAppendMembufferReadExp = urCommandBufferAppendMembufferReadExp; + pDdiTable->pfnAppendMembufferReadRectExp = + urCommandBufferAppendMembufferReadRectExp; + pDdiTable->pfnAppendMembufferWriteExp = + urCommandBufferAppendMembufferWriteExp; + pDdiTable->pfnAppendMembufferWriteRectExp = + urCommandBufferAppendMembufferWriteRectExp; + pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; + + return retVal; +} + #if defined(__cplusplus) } // extern "C" #endif diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp index daca6c6061c09..8be25d0128612 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm.cpp @@ -40,11 +40,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( /// USM: Implements USM device allocations using a normal HIP device pointer UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( - ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hDevice; - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), 
UR_RESULT_ERROR_INVALID_VALUE); @@ -67,11 +65,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( /// USM: Implements USM Shared allocations using HIP Managed Memory UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( - ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hDevice; - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); @@ -83,9 +79,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( } catch (ur_result_t Error) { Result = Error; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); if (Result == UR_RESULT_SUCCESS) { assert((!pUSMDesc || pUSMDesc->align == 0 || From 9e6f7962876c027e801608d43fd25f88ac3f81d6 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 11 Jul 2023 11:01:00 +0100 Subject: [PATCH 40/42] [SYCL][UR][HIP] Add usm p2p entry points --- sycl/plugins/hip/CMakeLists.txt | 1 + sycl/plugins/hip/pi_hip.cpp | 42 ++----------------- sycl/plugins/unified_runtime/CMakeLists.txt | 1 + .../ur/adapters/hip/ur_interface_loader.cpp | 13 ++++++ .../ur/adapters/hip/usm_p2p.cpp | 31 ++++++++++++++ 5 files changed, 49 insertions(+), 39 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 32d2520af3daa..6f9bcbcd71818 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -116,6 +116,7 @@ add_sycl_plugin(hip "../unified_runtime/ur/adapters/hip/queue.hpp" "../unified_runtime/ur/adapters/hip/command_buffer.cpp" "../unified_runtime/ur/adapters/hip/command_buffer.hpp" + "../unified_runtime/ur/adapters/hip/usm_p2p.cpp" 
"../unified_runtime/ur/adapters/hip/ur_interface_loader.cpp" "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 7dbdc72493d04..82a7268d53326 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -30,42 +30,6 @@ //-- PI API implementation extern "C" { -pi_result hip_piextEnablePeerAccess(pi_device command_device, - pi_device peer_device) { - - std::ignore = command_device; - std::ignore = peer_device; - - setErrorMessage("piextEnablePeerAccess not " - "implemented in hip backend", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; -} - -pi_result hip_piextDisablePeerAccess(pi_device command_device, - pi_device peer_device) { - - std::ignore = command_device; - std::ignore = peer_device; - - setErrorMessage("piextDisablePeerAccess not " - "implemented in hip backend", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; -} - -pi_result hip_piextPeerAccessGetInfo(pi_device command_device, - pi_device peer_device, pi_peer_attr attr, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - std::ignore = command_device; - std::ignore = peer_device; - std::ignore = attr; - // Zero return value indicates that all of the queries currently return false. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - pi_int32{0}); -} - const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -244,9 +208,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueCommandBuffer, pi2ur::piextEnqueueCommandBuffer) // Peer to Peer - _PI_CL(piextEnablePeerAccess, hip_piextEnablePeerAccess) - _PI_CL(piextDisablePeerAccess, hip_piextDisablePeerAccess) - _PI_CL(piextPeerAccessGetInfo, hip_piextPeerAccessGetInfo) + _PI_CL(piextEnablePeerAccess, pi2ur::piextEnablePeerAccess) + _PI_CL(piextDisablePeerAccess, pi2ur::piextDisablePeerAccess) + _PI_CL(piextPeerAccessGetInfo, pi2ur::piextPeerAccessGetInfo) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 833c3463e242b..fb6e850b7329f 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -206,6 +206,7 @@ if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/hip/queue.hpp" "ur/adapters/hip/command_buffer.hpp" "ur/adapters/hip/command_buffer.cpp" + "ur/adapters/hip/usm_p2p.cpp" "ur/adapters/hip/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp index 15105bb472e71..580b9916fb485 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/ur_interface_loader.cpp @@ -286,6 +286,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( return retVal; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnEnablePeerAccessExp = 
urUsmP2PEnablePeerAccessExp; + pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; + pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; + + return retVal; +} + #if defined(__cplusplus) } // extern "C" #endif diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp new file mode 100644 index 0000000000000..6de6f82e73007 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp @@ -0,0 +1,31 @@ +//===--------- usm_p2p.cpp - HIP Adapter---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------===// + +#include "common.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urUsmP2PEnablePeerAccessExp(ur_device_handle_t, ur_device_handle_t) { + detail::ur::die( + "urUsmP2PEnablePeerAccessExp is not implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUsmP2PDisablePeerAccessExp(ur_device_handle_t, ur_device_handle_t) { + detail::ur::die( + "urUsmP2PDisablePeerAccessExp is not implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t, ur_device_handle_t, + ur_exp_peer_info_t, size_t, void *, size_t *) { + detail::ur::die( + "urUsmP2PPeerAccessGetInfoExp is not implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} From 743c220283b0c8da84876379f3964fbb417f04e6 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 11 Jul 2023 15:16:54 +0100 Subject: [PATCH 41/42] [SYCL][UR][HIP] Change urUsmP2PPeerAccessGetInfoExp to return zero --- .../unified_runtime/ur/adapters/hip/usm_p2p.cpp | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp b/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp index 6de6f82e73007..aefcf4755558a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/hip/usm_p2p.cpp @@ -22,10 +22,10 @@ urUsmP2PDisablePeerAccessExp(ur_device_handle_t, ur_device_handle_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL -urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t, ur_device_handle_t, - ur_exp_peer_info_t, size_t, void *, size_t *) { - detail::ur::die( - "urUsmP2PPeerAccessGetInfoExp is not implemented for HIP adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( + ur_device_handle_t, ur_device_handle_t, ur_exp_peer_info_t, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // Zero return value indicates that all of the queries currently return false. 
+ return ReturnValue(uint32_t{0}); } From e587f645bb2e306a104ba539deb32c2c20dbc6a6 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 13 Jul 2023 11:05:21 +0100 Subject: [PATCH 42/42] [SYCL][UR][HIP] remove piEnqueueNativeKernel --- sycl/plugins/hip/pi_hip.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 82a7268d53326..4338efa69a9a5 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -140,7 +140,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Enqueue commands _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) - _PI_CL(piEnqueueNativeKernel, pi2ur::piEnqueueNativeKernel) _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead)