llvm · lucas-rami · May 22, 2025 · May 22, 2025 · Jun 27, 2025 · arsenm
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
     llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
   unsigned Min =
       Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
-  unsigned Max =
-      Attr->getMax()
-          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
-          : 0;
 
-  if (Min != 0) {
-    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
-
-    std::string AttrVal = llvm::utostr(Min);
-    if (Max != 0)
-      AttrVal = AttrVal + "," + llvm::utostr(Max);
-    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
-  } else
-    assert(Max == 0 && "Max must be zero");
+  if (Attr->getMax()) {
+    unsigned Max =
+        Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
+    assert(Min == 0 || (Min != 0 && Max != 0) &&
+                           "Min must be non-zero when Max is non-zero");
+    assert(Min <= Max && "Min must be less than or equal to Max");
+    // Do not add the attribute if min,max=0,0.
+    if (Max != 0) {
+      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
+    }
+  } else if (Min != 0) {
+    F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
+  }
 }
 
 std::unique_ptr<TargetCodeGenInfo>

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
   if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
     return true;
 
-  if (Min == 0 && Max != 0) {
-    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
-        << &Attr << 0;
-    return true;
-  }
   if (Max != 0 && Min > Max) {
     S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
         << &Attr << 1;

diff --git a/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip b/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip
@@ -0,0 +1,53 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+
+// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP
+
+#define __global__ __attribute__((global))
+
+//.
+// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
+// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
+//.
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z21kernel_waves_per_eu_0v
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0))) void kernel_waves_per_eu_0() {}
+
+// Equivalent to kernel_waves_per_eu_0.
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_0v
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0, 0))) void kernel_waves_per_eu_0_0() {}
+
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_4v
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {}
+
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_1_4v
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(1, 4))) void kernel_waves_per_eu_1_4() {}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="0,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CHECK: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="1,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+//.
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
 __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
 
 __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
-__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
 
 __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
 __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}
@@ -61,6 +60,7 @@ __attribute__((amdgpu_num_vgpr(0))) kernel void kernel_num_vgpr_0() {}
 
 kernel __attribute__((amdgpu_flat_work_group_size(32, 64))) void kernel_flat_work_group_size_32_64() {}
 kernel __attribute__((amdgpu_waves_per_eu(2))) void kernel_waves_per_eu_2() {}
+kernel __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {}
 kernel __attribute__((amdgpu_waves_per_eu(2, 4))) void kernel_waves_per_eu_2_4() {}
 kernel __attribute__((amdgpu_num_sgpr(32))) void kernel_num_sgpr_32() {}
 kernel __attribute__((amdgpu_num_vgpr(64))) void kernel_num_vgpr_64() {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
   }
 }
 
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
-  const Function &F) const {
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
   // Default minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Default =
-    getDefaultFlatWorkGroupSize(F.getCallingConv());
+      getDefaultFlatWorkGroupSize(F.getCallingConv());
 
   // Requested minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
-    F, "amdgpu-flat-work-group-size", Default);
+      F, "amdgpu-flat-work-group-size", Default);
 
   // Make sure requested minimum is less than requested maximum.
   if (Requested.first > Requested.second)
@@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
   // sizes limits the achievable maximum, and we aim to support enough waves per
   // EU so that we can concurrently execute all waves of a single workgroup of
   // maximum size on a CU.
-  std::pair<unsigned, unsigned> Default = {
+  std::pair<unsigned, unsigned> WavesPerEU = {
       getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
       getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
-  Default.first = std::min(Default.first, Default.second);
-
-  // Make sure requested minimum is within the default range and lower than the
-  // requested maximum. The latter must not violate target specification.
-  if (RequestedWavesPerEU.first < Default.first ||
-      RequestedWavesPerEU.first > Default.second ||
-      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
-      RequestedWavesPerEU.second > getMaxWavesPerEU())
-    return Default;
-
-  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
-  RequestedWavesPerEU.second =
-      std::min(RequestedWavesPerEU.second, Default.second);
-  return RequestedWavesPerEU;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+
+  // Requested minimum must not violate subtarget's specifications and be no
+  // greater than maximum.
+  if (RequestedWavesPerEU.first &&
+      (RequestedWavesPerEU.first < getMinWavesPerEU() ||
+       RequestedWavesPerEU.first > RequestedWavesPerEU.second))
+    return WavesPerEU;
+  // Requested maximum must not violate subtarget's specifications.
+  if (RequestedWavesPerEU.second > getMaxWavesPerEU())
+    return WavesPerEU;
+
+  // A requested maximum may limit both the final minimum and maximum, but
+  // not increase them. A requested minimum can either decrease or increase the
+  // default minimum as long as it doesn't exceed the maximum.
+  WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
+  if (RequestedWavesPerEU.first)
+    WavesPerEU.first = RequestedWavesPerEU.first;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+  return WavesPerEU;
 }
 
 std::pair<unsigned, unsigned>
@@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                                unsigned LDSBytes, const Function &F) const {
   // Default minimum/maximum number of waves per execution unit.
-  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+  std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
 
   // Requested minimum/maximum number of waves per execution unit.
   std::pair<unsigned, unsigned> Requested =

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -127,7 +127,9 @@ class AMDGPUSubtarget {
   /// Returns the target minimum/maximum number of waves per EU. This is based
   /// on the minimum/maximum number of \p RequestedWavesPerEU and further
   /// limited by the maximum achievable occupancy derived from the range of \p
-  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
+  /// minimum requested waves/EU value of 0 indicates an intent to not restrict
+  /// the minimum target occupancy.
   std::pair<unsigned, unsigned>
   getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSizes,

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,3 +225,15 @@ entry:
   ret void
 }
 attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #13 {
+entry:
+  ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -57,5 +57,5 @@ entry:
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
+attributes #0 = { "amdgpu-waves-per-eu"="1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }