@@ -650,8 +650,8 @@ struct AMDGPUQueueTy {
650
650
hsa_kernel_dispatch_packet_t *Packet = acquirePacket (PacketId);
651
651
assert (Packet && " Invalid packet" );
652
652
653
- // The header of the packet is written in the last moment.
654
- Packet-> setup = UINT16_C (1 ) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
653
+ // The first 32 bits of the packet are written after the other fields
654
+ uint16_t Setup = UINT16_C (1 ) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
655
655
Packet->workgroup_size_x = NumThreads;
656
656
Packet->workgroup_size_y = 1 ;
657
657
Packet->workgroup_size_z = 1 ;
@@ -667,7 +667,7 @@ struct AMDGPUQueueTy {
667
667
Packet->completion_signal = OutputSignal->get ();
668
668
669
669
// Publish the packet. Do not modify the packet after this point.
670
- publishKernelPacket (PacketId, Packet);
670
+ publishKernelPacket (PacketId, Setup, Packet);
671
671
672
672
return Plugin::success ();
673
673
}
@@ -744,17 +744,17 @@ struct AMDGPUQueueTy {
744
744
/// Publish the kernel packet so that the HSA runtime can start processing
745
745
/// the kernel launch. Do not modify the packet once this function is called.
746
746
/// Assumes the queue lock is acquired.
747
- void publishKernelPacket (uint64_t PacketId,
747
+ void publishKernelPacket (uint64_t PacketId, uint16_t Setup,
748
748
hsa_kernel_dispatch_packet_t *Packet) {
749
749
uint32_t *PacketPtr = reinterpret_cast <uint32_t *>(Packet);
750
750
751
- uint16_t Setup = Packet->setup ;
752
751
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
753
752
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
754
753
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
755
754
756
755
// Publish the packet. Do not modify the packet after this point.
757
- __atomic_store_n (PacketPtr, Header | (Setup << 16 ), __ATOMIC_RELEASE);
756
+ uint32_t HeaderWord = Header | (Setup << 16u );
757
+ __atomic_store_n (PacketPtr, HeaderWord, __ATOMIC_RELEASE);
758
758
759
759
// Signal the doorbell about the published packet.
760
760
hsa_signal_store_relaxed (Queue->doorbell_signal , PacketId);
@@ -766,14 +766,14 @@ struct AMDGPUQueueTy {
766
766
void publishBarrierPacket (uint64_t PacketId,
767
767
hsa_barrier_and_packet_t *Packet) {
768
768
uint32_t *PacketPtr = reinterpret_cast <uint32_t *>(Packet);
769
-
770
769
uint16_t Setup = 0 ;
771
770
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
772
771
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
773
772
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
774
773
775
774
// Publish the packet. Do not modify the packet after this point.
776
- __atomic_store_n (PacketPtr, Header | (Setup << 16 ), __ATOMIC_RELEASE);
775
+ uint32_t HeaderWord = Header | (Setup << 16u );
776
+ __atomic_store_n (PacketPtr, HeaderWord, __ATOMIC_RELEASE);
777
777
778
778
// Signal the doorbell about the published packet.
779
779
hsa_signal_store_relaxed (Queue->doorbell_signal , PacketId);
0 commit comments