Skip to content

Commit d55e389

Browse files
[amdgpu][openmp] Avoiding writing to packet header twice
1 parent 3651f37 commit d55e389

File tree

2 files changed

+16
-15
lines changed
  • libc/utils/gpu/loader/amdgpu
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

2 files changed

+16
-15
lines changed

libc/utils/gpu/loader/amdgpu/Loader.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,34 +222,35 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
222222
// Set up the packet for exeuction on the device. We currently only launch
223223
// with one thread on the device, forcing the rest of the wavefront to be
224224
// masked off.
225-
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
226-
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
227-
(params.num_blocks_z * params.num_threads_z != 1))
228-
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
225+
uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
226+
(params.num_blocks_z * params.num_threads_z != 1))
227+
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
229228
packet->workgroup_size_x = params.num_threads_x;
230229
packet->workgroup_size_y = params.num_threads_y;
231230
packet->workgroup_size_z = params.num_threads_z;
231+
packet->reserved0 = 0;
232232
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
233233
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
234234
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
235235
packet->private_segment_size = private_size;
236236
packet->group_segment_size = group_size;
237237
packet->kernel_object = kernel;
238238
packet->kernarg_address = args;
239-
239+
packet->reserved2 = 0;
240240
// Create a signal to indicate when this packet has been completed.
241241
if (hsa_status_t err =
242242
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
243243
handle_error(err);
244244

245245
// Initialize the packet header and set the doorbell signal to begin execution
246246
// by the HSA runtime.
247-
uint16_t setup = packet->setup;
248247
uint16_t header =
249248
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
250249
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
251250
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
252-
__atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
251+
uint32_t header_word =
252+
header | (setup << 16u) __atomic_store_n((uint32_t *)&packet->header,
253+
header_word, __ATOMIC_RELEASE);
253254
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
254255

255256
// Wait until the kernel has completed execution on the device. Periodically

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -650,8 +650,8 @@ struct AMDGPUQueueTy {
650650
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
651651
assert(Packet && "Invalid packet");
652652

653-
// The header of the packet is written in the last moment.
654-
Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
653+
// The first 32 bits of the packet are written after the other fields
654+
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
655655
Packet->workgroup_size_x = NumThreads;
656656
Packet->workgroup_size_y = 1;
657657
Packet->workgroup_size_z = 1;
@@ -667,7 +667,7 @@ struct AMDGPUQueueTy {
667667
Packet->completion_signal = OutputSignal->get();
668668

669669
// Publish the packet. Do not modify the packet after this point.
670-
publishKernelPacket(PacketId, Packet);
670+
publishKernelPacket(PacketId, Setup, Packet);
671671

672672
return Plugin::success();
673673
}
@@ -744,17 +744,17 @@ struct AMDGPUQueueTy {
744744
/// Publish the kernel packet so that the HSA runtime can start processing
745745
/// the kernel launch. Do not modify the packet once this function is called.
746746
/// Assumes the queue lock is acquired.
747-
void publishKernelPacket(uint64_t PacketId,
747+
void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
748748
hsa_kernel_dispatch_packet_t *Packet) {
749749
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
750750

751-
uint16_t Setup = Packet->setup;
752751
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
753752
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
754753
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
755754

756755
// Publish the packet. Do not modify the package after this point.
757-
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
756+
uint32_t HeaderWord = Header | (Setup << 16u);
757+
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
758758

759759
// Signal the doorbell about the published packet.
760760
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
@@ -766,14 +766,14 @@ struct AMDGPUQueueTy {
766766
void publishBarrierPacket(uint64_t PacketId,
767767
hsa_barrier_and_packet_t *Packet) {
768768
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
769-
770769
uint16_t Setup = 0;
771770
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
772771
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
773772
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
774773

775774
// Publish the packet. Do not modify the package after this point.
776-
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
775+
uint32_t HeaderWord = Header | (Setup << 16u);
776+
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
777777

778778
// Signal the doorbell about the published packet.
779779
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);

0 commit comments

Comments
 (0)