Skip to content

Commit 30a7332

Browse files
kerbowayxsamliu
authored and committed
[AMDGPU] Support preloading hidden kernel arguments (llvm#98861)
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object. Change-Id: Iea2929f1d56aecb9e15ed942bcc6b4a1831dd0e2
1 parent e631fe1 commit 30a7332

File tree

10 files changed

+1101
-9
lines changed

10 files changed

+1101
-9
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,6 +1319,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
13191319
the frame. This is an internal detail of how LDS variables are lowered,
13201320
language front ends should not set this attribute.
13211321

1322+
"amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments
1323+
as hidden. Hidden arguments are managed by the compiler and are not part of
1324+
the explicit arguments supplied by the user.
1325+
13221326
======================================= ==========================================================
13231327

13241328
Calling Conventions
@@ -5336,6 +5340,12 @@ may insert a trap instruction at the start of the kernel prologue to manage
53365340
situations where kernarg preloading is attempted on hardware with incompatible
53375341
firmware.
53385342

5343+
With code object V5 and later, hidden kernel arguments that are normally
5344+
accessed through the Implicit Argument Ptr may be preloaded into User SGPRs.
5345+
These arguments are added to the kernel function signature and are marked with
5346+
the attributes "inreg" and "amdgpu-hidden-argument". (See
5347+
:ref:`amdgpu-llvm-ir-attributes-table`).
5348+
53395349
.. _amdgpu-amdhsa-kernel-prolog:
53405350

53415351
Kernel Prolog

llvm/include/llvm/IR/Argument.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ class Argument final : public Value {
171171
/// Check if an argument has a given attribute.
172172
bool hasAttribute(Attribute::AttrKind Kind) const;
173173

174+
bool hasAttribute(StringRef Kind) const;
175+
174176
Attribute getAttribute(Attribute::AttrKind Kind) const;
175177

176178
/// Method for support type inquiry through isa, cast, and dyn_cast.

llvm/include/llvm/IR/Function.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,9 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
418418
/// check if an attribute is in the list of attributes.
419419
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
420420

421+
/// Check if an attribute is in the list of attributes.
422+
bool hasParamAttribute(unsigned ArgNo, StringRef Kind) const;
423+
421424
/// gets the attribute from the list of attributes.
422425
Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const;
423426

llvm/lib/IR/Function.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
330330
return getParent()->hasParamAttribute(getArgNo(), Kind);
331331
}
332332

333+
/// Check whether this argument carries the string attribute \p Kind by
/// querying the parent function's parameter attributes at this argument's
/// index.
bool Argument::hasAttribute(StringRef Kind) const {
  const Function *Parent = getParent();
  const unsigned ArgIndex = getArgNo();
  return Parent->hasParamAttribute(ArgIndex, Kind);
}
336+
333337
Attribute Argument::getAttribute(Attribute::AttrKind Kind) const {
334338
return getParent()->getParamAttribute(getArgNo(), Kind);
335339
}
@@ -680,6 +684,10 @@ bool Function::hasParamAttribute(unsigned ArgNo,
680684
return AttributeSets.hasParamAttr(ArgNo, Kind);
681685
}
682686

687+
bool Function::hasParamAttribute(unsigned ArgNo, StringRef Kind) const {
688+
return AttributeSets.hasParamAttr(ArgNo, Kind);
689+
}
690+
683691
Attribute Function::getAttributeAtIndex(unsigned i,
684692
Attribute::AttrKind Kind) const {
685693
return AttributeSets.getAttributeAtIndex(i, Kind);

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,12 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
259259
auto &Func = MF.getFunction();
260260
unsigned Offset = 0;
261261
auto Args = HSAMetadataDoc->getArrayNode();
262-
for (auto &Arg : Func.args())
262+
for (auto &Arg : Func.args()) {
263+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
264+
continue;
265+
263266
emitKernelArg(Arg, Offset, Args);
267+
}
264268

265269
emitHiddenKernelArgs(MF, Offset, Args);
266270

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 200 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
// Identifies the hidden kernel arguments handled here. Values are consecutive
// starting at zero; END_HIDDEN_ARGS is the number of real entries and is also
// used as the "not found" sentinel.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 1,
  HIDDEN_BLOCK_COUNT_Z = 2,
  HIDDEN_GROUP_SIZE_X = 3,
  HIDDEN_GROUP_SIZE_Y = 4,
  HIDDEN_GROUP_SIZE_Z = 5,
  HIDDEN_REMAINDER_X = 6,
  HIDDEN_REMAINDER_Y = 7,
  HIDDEN_REMAINDER_Z = 8,
  END_HIDDEN_ARGS = 9
};
48+
49+
// Describes one hidden argument: where it lives relative to the implicitarg
// pointer, how large it is, and what it is called in the kernel signature.
struct HiddenArgInfo {
  /// Byte offset from the kernarg segment location that the implicitarg
  /// pointer refers to.
  uint8_t Offset;
  /// Size of the hidden argument, in bytes.
  uint8_t Size;
  /// Name given to the hidden argument in the kernel signature.
  const char *Name;
};
59+
60+
// Table of hidden arguments, one entry per HiddenArg enumerator in enumerator
// order. Each entry is {Offset, Size, Name}.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"},
    {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},
    {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},
    {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},
    {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"},
};
66+
67+
// Maps a byte offset (relative to the implicitarg pointer) to the hidden
// argument that starts exactly at that offset. Returns END_HIDDEN_ARGS when no
// table entry has that offset.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx != END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;

  // Idx is END_HIDDEN_ARGS here if the scan ran off the end of the table.
  return static_cast<HiddenArg>(Idx);
}
74+
75+
// Returns the integer type whose bit width matches the byte size recorded in
// the table for hidden argument HA. HA must be a real entry, i.e. less than
// END_HIDDEN_ARGS.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  const unsigned BitWidth = HiddenArgs[HA].Size * 8;
  return Type::getIntNTy(Ctx, BitWidth);
}
81+
82+
// Returns the signature name recorded in the table for hidden argument HA.
// HA must be a real entry, i.e. less than END_HIDDEN_ARGS.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  return HiddenArgs[HA].Name;
}
3688

89+
// Clones the function after adding implicit arguments to the argument list
90+
// and returns the new updated function. Preloaded implicit arguments are
91+
// added up to and including the last one that will be preloaded, indicated by
92+
// LastPreloadIndex. Currently preloading is only performed on the totality of
93+
// sequential data from the kernarg segment including implicit (hidden)
94+
// arguments. This means that all arguments up to the last preloaded argument
95+
// will also be preloaded even if that data is unused.
96+
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
97+
FunctionType *FT = F.getFunctionType();
98+
LLVMContext &Ctx = F.getParent()->getContext();
99+
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
100+
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
101+
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
102+
103+
FunctionType *NFT =
104+
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
105+
Function *NF =
106+
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
107+
108+
NF->copyAttributesFrom(&F);
109+
NF->copyMetadata(&F, 0);
110+
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
111+
112+
F.getParent()->getFunctionList().insert(F.getIterator(), NF);
113+
NF->takeName(&F);
114+
NF->splice(NF->begin(), &F);
115+
116+
Function::arg_iterator NFArg = NF->arg_begin();
117+
for (Argument &Arg : F.args()) {
118+
Arg.replaceAllUsesWith(&*NFArg);
119+
NFArg->takeName(&Arg);
120+
++NFArg;
121+
}
122+
123+
AttrBuilder AB(Ctx);
124+
AB.addAttribute(Attribute::InReg);
125+
AB.addAttribute("amdgpu-hidden-argument");
126+
AttributeList AL = NF->getAttributes();
127+
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
128+
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
129+
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
130+
}
131+
132+
NF->setAttributes(AL);
133+
F.replaceAllUsesWith(NF);
134+
F.setCallingConv(CallingConv::C);
135+
136+
return NF;
137+
}
138+
139+
public:
37140
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38141
setInitialFreeUserSGPRsCount();
39142
}
@@ -86,6 +189,87 @@ class PreloadKernelArgInfo {
86189
<< "\n";
87190
return true;
88191
}
192+
193+
// Try to allocate SGPRs to preload implicit kernel arguments.
194+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
195+
IRBuilder<> &Builder) {
196+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
197+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
198+
if (!ImplicitArgPtr)
199+
return;
200+
201+
const DataLayout &DL = F.getParent()->getDataLayout();
202+
// Pair is the load and the load offset.
203+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
204+
for (auto *U : ImplicitArgPtr->users()) {
205+
Instruction *CI = dyn_cast<Instruction>(U);
206+
if (!CI || CI->getParent()->getParent() != &F)
207+
continue;
208+
209+
for (auto *U : CI->users()) {
210+
int64_t Offset = 0;
211+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
212+
if (!Load) {
213+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
214+
continue;
215+
216+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
217+
}
218+
219+
if (!Load || !Load->isSimple())
220+
continue;
221+
222+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
223+
LLVMContext &Ctx = F.getParent()->getContext();
224+
Type *LoadTy = Load->getType();
225+
HiddenArg HA = getHiddenArgFromOffset(Offset);
226+
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
227+
continue;
228+
229+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
230+
}
231+
}
232+
233+
if (ImplicitArgLoads.empty())
234+
return;
235+
236+
// Allocate loads in order of offset. We need to be sure that the implicit
237+
// argument can actually be preloaded.
238+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
239+
240+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
241+
// If we fail to preload any implicit argument we know we don't have SGPRs
242+
// to preload any subsequent ones with larger offsets. Find the first
243+
// argument that we cannot preload.
244+
auto *PreloadEnd = std::find_if(
245+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
246+
[&](const std::pair<LoadInst *, unsigned> &Load) {
247+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
248+
unsigned LoadOffset = Load.second;
249+
if (!tryAllocPreloadSGPRs(LoadSize,
250+
LoadOffset + ImplicitArgsBaseOffset,
251+
LastExplicitArgOffset))
252+
return true;
253+
254+
LastExplicitArgOffset = LoadOffset + LoadSize;
255+
return false;
256+
});
257+
258+
if (PreloadEnd == ImplicitArgLoads.begin())
259+
return;
260+
261+
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
262+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
263+
assert(NF);
264+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
265+
LoadInst *LoadInst = I->first;
266+
unsigned LoadOffset = I->second;
267+
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
268+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
269+
Argument *Arg = NF->getArg(Index);
270+
LoadInst->replaceAllUsesWith(Arg);
271+
}
272+
}
89273
};
90274

91275
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -169,6 +353,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
169353
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
170354
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
171355

356+
// Guard against the situation where hidden arguments have already been
357+
// lowered and added to the kernel function signature, i.e. in a situation
358+
// where this pass has run twice.
359+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
360+
break;
361+
172362
if (DBG) {
173363
llvm::errs() << " arg: " << Arg
174364
<< " Arg.hasInRegAttr()=" << Arg.hasInRegAttr()
@@ -315,6 +505,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
315505
KernArgSegment->addRetAttr(
316506
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
317507

508+
if (InPreloadSequence) {
509+
uint64_t ImplicitArgsBaseOffset =
510+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
511+
BaseOffset;
512+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
513+
Builder);
514+
}
515+
318516
return true;
319517
}
320518

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,9 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
593593
MaxAlign = Align(1);
594594

595595
for (const Argument &Arg : F.args()) {
596+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
597+
continue;
598+
596599
const bool IsByRef = Arg.hasByRefAttr();
597600
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
598601
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2427,24 +2427,25 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
24272427
const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
24282428
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
24292429
Function &F = MF.getFunction();
2430-
unsigned LastExplicitArgOffset =
2431-
MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2430+
unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
24322431
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
24332432
bool InPreloadSequence = true;
24342433
unsigned InIdx = 0;
2434+
bool AlignedForImplictArgs = false;
2435+
unsigned ImplicitArgOffset = 0;
24352436
for (auto &Arg : F.args()) {
24362437
if (!InPreloadSequence || !Arg.hasInRegAttr())
24372438
break;
24382439

2439-
int ArgIdx = Arg.getArgNo();
2440+
unsigned ArgIdx = Arg.getArgNo();
24402441
// Don't preload non-original args or parts not in the current preload
24412442
// sequence.
2442-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2443-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2443+
if (InIdx < Ins.size() &&
2444+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
24442445
break;
24452446

24462447
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2447-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2448+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
24482449
InIdx++) {
24492450
assert(ArgLocs[ArgIdx].isMemLoc());
24502451
auto &ArgLoc = ArgLocs[InIdx];
@@ -2454,6 +2455,18 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
24542455
unsigned NumAllocSGPRs =
24552456
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
24562457

2458+
// Fix alignment for hidden arguments.
2459+
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2460+
if (!AlignedForImplictArgs) {
2461+
ImplicitArgOffset =
2462+
alignTo(LastExplicitArgOffset,
2463+
Subtarget->getAlignmentForImplicitArgPtr()) -
2464+
LastExplicitArgOffset;
2465+
AlignedForImplictArgs = true;
2466+
}
2467+
ArgOffset += ImplicitArgOffset;
2468+
}
2469+
24572470
// Arg is preloaded into the previous SGPR.
24582471
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
24592472
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

0 commit comments

Comments
 (0)