Skip to content

[AMDGPU][Clang] Allow amdgpu-waves-per-eu attribute to lower target occupancy range #138284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions clang/lib/CodeGen/Targets/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
unsigned Min =
Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
unsigned Max =
Attr->getMax()
? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
: 0;

if (Min != 0) {
assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

std::string AttrVal = llvm::utostr(Min);
if (Max != 0)
AttrVal = AttrVal + "," + llvm::utostr(Max);
F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
} else
assert(Max == 0 && "Max must be zero");
if (Attr->getMax()) {
unsigned Max =
Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
assert(Min == 0 || (Min != 0 && Max != 0) &&
"Min must be non-zero when Max is non-zero");
assert(Min <= Max && "Min must be less than or equal to Max");
// Do not add the attribute if min,max=0,0.
if (Max != 0) {
std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
}
} else if (Min != 0) {
F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
}
}

std::unique_ptr<TargetCodeGenInfo>
Expand Down
5 changes: 0 additions & 5 deletions clang/lib/Sema/SemaAMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
return true;

if (Min == 0 && Max != 0) {
S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
<< &Attr << 0;
return true;
}
if (Max != 0 && Min > Max) {
S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
<< &Attr << 1;
Expand Down
53 changes: 53 additions & 0 deletions clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -emit-llvm -o - %s | FileCheck %s

// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP

#define __global__ __attribute__((global))

//.
// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
//.
// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@_Z21kernel_waves_per_eu_0v
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
//
__global__ __attribute__((amdgpu_waves_per_eu(0))) void kernel_waves_per_eu_0() {}

// Equivalent to kernel_waves_per_eu_0.
// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_0v
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
//
__global__ __attribute__((amdgpu_waves_per_eu(0, 0))) void kernel_waves_per_eu_0_0() {}

// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_4v
// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
//
__global__ __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {}

// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_1_4v
// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
//
__global__ __attribute__((amdgpu_waves_per_eu(1, 4))) void kernel_waves_per_eu_1_4() {}
//.
// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="0,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
// CHECK: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="1,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
//.
2 changes: 1 addition & 1 deletion clang/test/SemaOpenCL/amdgpu-attrs.cl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
__attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}

__attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing clang codegen test changes that show the new accepted values. This is still not emitting minimums of 0 though, so this is just losing a test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[0,4] is now a valid range (i.e., no minimum requested, at most 4) so I moved it below instead of deleting it. I also added some HIP codegen tests.


__attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
__attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}
Expand All @@ -61,6 +60,7 @@ __attribute__((amdgpu_num_vgpr(0))) kernel void kernel_num_vgpr_0() {}

kernel __attribute__((amdgpu_flat_work_group_size(32, 64))) void kernel_flat_work_group_size_32_64() {}
kernel __attribute__((amdgpu_waves_per_eu(2))) void kernel_waves_per_eu_2() {}
kernel __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {}
kernel __attribute__((amdgpu_waves_per_eu(2, 4))) void kernel_waves_per_eu_2_4() {}
kernel __attribute__((amdgpu_num_sgpr(32))) void kernel_num_sgpr_32() {}
kernel __attribute__((amdgpu_num_vgpr(64))) void kernel_num_vgpr_64() {}
46 changes: 26 additions & 20 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
}
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
const Function &F) const {
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
// Default minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Default =
getDefaultFlatWorkGroupSize(F.getCallingConv());
getDefaultFlatWorkGroupSize(F.getCallingConv());

// Requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-flat-work-group-size", Default);
F, "amdgpu-flat-work-group-size", Default);

// Make sure requested minimum is less than requested maximum.
if (Requested.first > Requested.second)
Expand All @@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
// sizes limits the achievable maximum, and we aim to support enough waves per
// EU so that we can concurrently execute all waves of a single workgroup of
// maximum size on a CU.
std::pair<unsigned, unsigned> Default = {
std::pair<unsigned, unsigned> WavesPerEU = {
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
Default.first = std::min(Default.first, Default.second);

// Make sure requested minimum is within the default range and lower than the
// requested maximum. The latter must not violate target specification.
if (RequestedWavesPerEU.first < Default.first ||
RequestedWavesPerEU.first > Default.second ||
RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
RequestedWavesPerEU.second > getMaxWavesPerEU())
return Default;

// We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
RequestedWavesPerEU.second =
std::min(RequestedWavesPerEU.second, Default.second);
return RequestedWavesPerEU;
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);

// Requested minimum must not violate subtarget's specifications and be no
// greater than maximum.
if (RequestedWavesPerEU.first &&
(RequestedWavesPerEU.first < getMinWavesPerEU() ||
RequestedWavesPerEU.first > RequestedWavesPerEU.second))
return WavesPerEU;
// Requested maximum must not violate subtarget's specifications.
if (RequestedWavesPerEU.second > getMaxWavesPerEU())
return WavesPerEU;

// A requested maximum may limit both the final minimum and maximum, but
// not increase them. A requested minimum can either decrease or increase the
// default minimum as long as it doesn't exceed the maximum.
WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
if (RequestedWavesPerEU.first)
WavesPerEU.first = RequestedWavesPerEU.first;
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
return WavesPerEU;
}

std::pair<unsigned, unsigned>
Expand All @@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
unsigned LDSBytes, const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());

// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested =
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ class AMDGPUSubtarget {
/// Returns the target minimum/maximum number of waves per EU. This is based
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
/// limited by the maximum achievable occupancy derived from the range of \p
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
/// minimum requested waves/EU value of 0 indicates an intent to not restrict
/// the minimum target occupancy.
std::pair<unsigned, unsigned>
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSizes,
Expand Down
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,15 @@ entry:
ret void
}
attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}

; At most 2 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_2:
; CHECK: SGPRBlocks: 12
; CHECK: VGPRBlocks: 21
; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 85
define amdgpu_kernel void @empty_at_most_2() #13 {
entry:
ret void
}
attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ entry:
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
attributes #0 = { "amdgpu-waves-per-eu"="1" }
attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }
Loading