diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index 1d0247a6dc5dc..c2a11fd8aab72 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -222,13 +222,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, // Set up the packet for exeuction on the device. We currently only launch // with one thread on the device, forcing the rest of the wavefront to be // masked off. - std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); - packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) + - (params.num_blocks_z * params.num_threads_z != 1)) - << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) + + (params.num_blocks_z * params.num_threads_z != 1)) + << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; packet->workgroup_size_x = params.num_threads_x; packet->workgroup_size_y = params.num_threads_y; packet->workgroup_size_z = params.num_threads_z; + packet->reserved0 = 0; packet->grid_size_x = params.num_blocks_x * params.num_threads_x; packet->grid_size_y = params.num_blocks_y * params.num_threads_y; packet->grid_size_z = params.num_blocks_z * params.num_threads_z; @@ -236,7 +236,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, packet->group_segment_size = group_size; packet->kernel_object = kernel; packet->kernarg_address = args; - + packet->reserved2 = 0; // Create a signal to indicate when this packet has been completed. if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) @@ -244,12 +244,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, // Initialize the packet header and set the doorbell signal to begin execution // by the HSA runtime. - uint16_t setup = packet->setup; uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - __atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE); + uint32_t header_word = + header | (setup << 16u) __atomic_store_n((uint32_t *)&packet->header, + header_word, __ATOMIC_RELEASE); hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); // Wait until the kernel has completed execution on the device. Periodically diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index ab24856f9bc78..1d6cb0625d80b 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -650,8 +650,8 @@ struct AMDGPUQueueTy { hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); assert(Packet && "Invalid packet"); - // The header of the packet is written in the last moment. - Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + // The first 32 bits of the packet are written after the other fields + uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; Packet->workgroup_size_x = NumThreads; Packet->workgroup_size_y = 1; Packet->workgroup_size_z = 1; @@ -667,7 +667,7 @@ struct AMDGPUQueueTy { Packet->completion_signal = OutputSignal->get(); // Publish the packet. Do not modify the packet after this point. - publishKernelPacket(PacketId, Packet); + publishKernelPacket(PacketId, Setup, Packet); return Plugin::success(); } @@ -744,17 +744,17 @@ struct AMDGPUQueueTy { /// Publish the kernel packet so that the HSA runtime can start processing /// the kernel launch. Do not modify the packet once this function is called. /// Assumes the queue lock is acquired. - void publishKernelPacket(uint64_t PacketId, + void publishKernelPacket(uint64_t PacketId, uint16_t Setup, hsa_kernel_dispatch_packet_t *Packet) { uint32_t *PacketPtr = reinterpret_cast(Packet); - uint16_t Setup = Packet->setup; uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; // Publish the packet. Do not modify the package after this point. - __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE); + uint32_t HeaderWord = Header | (Setup << 16u); + __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); // Signal the doorbell about the published packet. hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); @@ -766,14 +766,14 @@ struct AMDGPUQueueTy { void publishBarrierPacket(uint64_t PacketId, hsa_barrier_and_packet_t *Packet) { uint32_t *PacketPtr = reinterpret_cast(Packet); - uint16_t Setup = 0; uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; // Publish the packet. Do not modify the package after this point. - __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE); + uint32_t HeaderWord = Header | (Setup << 16u); + __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); // Signal the doorbell about the published packet. hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);