From 2b4ef1fba8fc9bedb0be9fec725ecd33e0bbbae2 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Fri, 2 May 2025 13:57:35 +0000 Subject: [PATCH 1/2] Allow 0 as min/max number of waves per EU --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 3 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 24 ++++++++---- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 38 +++++++++++++++++++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 28 +++++++------- 5 files changed, 72 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index b9ce8dc0c5cdb..0bbbe766968fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1125,8 +1125,7 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { indicateOptimisticFixpoint(); }; - std::pair<unsigned, unsigned> MaxWavesPerEURange{ - 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair<unsigned, unsigned> MaxWavesPerEURange{0, 0}; // If the attribute exists, we will honor it if it is not the default. if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 563605f964cc6..4212d97eb9404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -191,17 +191,25 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU( getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; Default.first = std::min(Default.first, Default.second); - // Make sure requested minimum is less than requested maximum. + if (RequestedWavesPerEU.first) { + // Requested minimum must not violate subtarget's specifications. + if (RequestedWavesPerEU.first < Default.first) + return Default; + // Requested maximum must be no less than the minimum. 
+ if (RequestedWavesPerEU.second && + RequestedWavesPerEU.first > RequestedWavesPerEU.second) + return Default; + } + // Requested maximum must not violate subtarget's specifications. if (RequestedWavesPerEU.second && - RequestedWavesPerEU.first > RequestedWavesPerEU.second) - return Default; - - // Make sure requested values do not violate subtarget's specifications and - // are compatible with values implied by minimum/maximum flat workgroup sizes. - if (RequestedWavesPerEU.first < Default.first || RequestedWavesPerEU.second > Default.second) return Default; + // Replace unspecified bounds in the request with the default bounds. + if (!RequestedWavesPerEU.first) + RequestedWavesPerEU.first = Default.first; + if (!RequestedWavesPerEU.second) + RequestedWavesPerEU.second = Default.second; return RequestedWavesPerEU; } @@ -220,7 +228,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); + std::pair<unsigned, unsigned> Default(0, 0); // Requested minimum/maximum number of waves per execution unit. std::pair<unsigned, unsigned> Requested = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 91fe2a69bc0b7..1cbb6a7b1ad43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -119,7 +119,9 @@ class AMDGPUSubtarget { /// Returns the target minimum/maximum number of waves per EU. This is based /// on the minimum/maximum number of \p RequestedWavesPerEU and further /// limited by the maximum achievable occupancy derived from the range of \p - /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A + /// minimum/maximum requested waves/EU value of 0 indicates an intent to not + /// restrict the corresponding bound. 
std::pair<unsigned, unsigned> getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU, std::pair<unsigned, unsigned> FlatWorkGroupSizes, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 4507fd5865989..d8827d0405295 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -200,3 +200,41 @@ entry: ret void } attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"} + +; At most 2 waves per execution unit. +; CHECK-LABEL: {{^}}empty_at_most_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_at_most_2() #11 { +entry: + ret void +} +attributes #11 = {"amdgpu-waves-per-eu"="0,2"} + +; Exactly 1024 workitems (limits occupancy to 8) and at least 5 waves per execution unit. +; "amdgpu-waves-per-eu"="5,0" should have the same effect as "amdgpu-waves-per-eu"="5". +; CHECK-LABEL: {{^}}empty_workitems_exactly_1024_waves_at_least_5: +; CHECK: SGPRBlocks: 8 +; CHECK: VGPRBlocks: 7 +; CHECK: NumSGPRsForWavesPerEU: 65 +; CHECK: NumVGPRsForWavesPerEU: 29 +define amdgpu_kernel void @empty_workitems_exactly_1024_waves_at_least_5() #12 { +entry: + ret void +} +attributes #12 = {"amdgpu-waves-per-eu"="5,0" "amdgpu-flat-work-group-size"="1024,1024"} + +; Unrestricted number of waves per execution unit. +; "amdgpu-waves-per-eu"="0,0" should have the same effect as not providing the attribute. 
+; CHECK-LABEL: {{^}}empty_default_waves: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @empty_default_waves() #13 { +entry: + ret void +} +attributes #13 = {"amdgpu-waves-per-eu"="0,0"} diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index ae114f3213d8f..967cc764ea19c 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s -; Check propagation of amdgpu-flat-work-group-size attribute. +; Check propagation of amdgpu-waves-per-eu attribute. ; Called from a single kernel with 1,8 define internal void @default_to_1_8_a() { @@ -216,30 +216,30 @@ define internal i32 @bitcasted_function() { ret i32 0 } -define internal void @called_from_invalid_bounds_0() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_0 -; CHECK-SAME: () #[[ATTR10:[0-9]+]] { +define internal void @called_without_min_waves() { +; CHECK-LABEL: define internal void @called_without_min_waves +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: ret void ; ret void } -define internal void @called_from_invalid_bounds_1() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_1 -; CHECK-SAME: () #[[ATTR10]] { +define internal void @called_from_invalid_bounds() { +; CHECK-LABEL: define internal void @called_from_invalid_bounds +; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -; Invalid range for amdgpu-waves-per-eu -define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { -; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 +; amdgpu-waves-per-eu range only provides a maximum. 
+define amdgpu_kernel void @kernel_0_8() #9 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_0_8 ; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_0() +; CHECK-NEXT: call void @called_without_min_waves() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_0() + call void @called_without_min_waves() ret void } @@ -247,10 +247,10 @@ define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_1() +; CHECK-NEXT: call void @called_from_invalid_bounds() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_1() + call void @called_from_invalid_bounds() ret void } From 107194b6f680004c637c3702189346b3b7eac9c5 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Fri, 2 May 2025 14:29:38 +0000 Subject: [PATCH 2/2] clang-format --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 4212d97eb9404..a966e485e618e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -201,8 +201,7 @@ std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( return Default; } // Requested maximum must not violate subtarget's specifications. - if (RequestedWavesPerEU.second && - RequestedWavesPerEU.second > Default.second) + if (RequestedWavesPerEU.second && RequestedWavesPerEU.second > Default.second) return Default; // Replace unspecified bounds in the request with the default bounds.