Skip to content

[AMDGPU][NFCI] Remove preload kernarg alloc dep on DAG isel path #96030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 78 additions & 55 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2465,71 +2465,94 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}

// Try to allocate user SGPRs to preload the kernel argument (or the piece of
// a split argument) described by \p ArgOffset / \p ArgSize, recording the
// chosen registers in \p Info's PreloadKernArgs entry \p Idx.
//
// \p LastExplicitArgOffset is updated to the end of the newly covered range
// on success. \p ExplicitArgOffset is the running end offset of the argument
// being processed; an \p ArgOffset at or past it means this location was
// already covered by a previous allocation.
//
// \returns true if the argument piece was preloaded (directly or by sharing
// the previous SGPR), false if it cannot be preloaded — callers treat false
// as the end of the preload sequence.
static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
                                uint64_t ExplicitArgOffset, uint64_t ArgOffset,
                                unsigned ArgSize, unsigned Idx,
                                MachineFunction &MF, const SIRegisterInfo &TRI,
                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
  // Already covered by an earlier allocation for this argument.
  if (ArgOffset >= ExplicitArgOffset)
    return false;

  // Kernarg segment is 16-byte aligned; derive this piece's alignment from
  // its offset within the segment.
  const Align KernelArgBaseAlign = Align(16);
  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
  unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;

  // Sub-dword, sub-dword-aligned pieces share the SGPR already allocated for
  // the previous piece: the arg is preloaded into the previous SGPR.
  if (ArgSize < 4 && Alignment < 4) {
    Info.getArgInfo().PreloadKernArgs[Idx].Regs.push_back(
        Info.getArgInfo().PreloadKernArgs[Idx - 1].Regs[0]);
    return true;
  }

  // SGPRs spent on alignment padding between the previous piece and this one.
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
  // Check for free user SGPRs for preloading (padding + data + one synthetic
  // SGPR).
  if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
      Info.getUserSGPRInfo().getNumFreeUserSGPRs()) {
    return false;
  }

  // Preload this argument.
  const TargetRegisterClass *RC =
      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
  SmallVectorImpl<MCRegister> *PreloadRegs =
      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);

  // Multiple registers means they were handed out as individual SGPR_32s.
  if (PreloadRegs->size() > 1)
    RC = &AMDGPU::SGPR_32RegClass;

  // MCRegister is a trivially copyable handle; take it by value (no auto, no
  // reference).
  for (MCRegister Reg : *PreloadRegs) {
    assert(Reg);
    MF.addLiveIn(Reg, RC);
    CCInfo.AllocateReg(Reg);
  }

  LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  return true;
}

// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
//
// Walks the IR arguments, computing each argument's kernarg-segment offset
// from the data layout, and hands the actual SGPR allocation to
// allocPreloadKernArg. When \p ArgLocs is empty (GlobalISel path) the
// IR-level size/offset is used directly; otherwise (DAG path) the
// pre-computed CCValAssign memory locations are walked, since one IR argument
// may be split over several locations.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
  Function &F = MF.getFunction();
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
  uint64_t ExplicitArgOffset = BaseOffset;
  uint64_t LastExplicitArgOffset = ExplicitArgOffset;
  // Index into ArgLocs; persists across IR arguments because one argument's
  // locations continue where the previous argument's stopped.
  unsigned LocIdx = 0;
  // Hoisted out of the loop — the module's layout does not change per arg.
  const DataLayout &DL = F.getParent()->getDataLayout();
  for (auto &Arg : F.args()) {
    // Only a leading run of 'inreg' arguments is eligible for preloading;
    // bail before computing any offsets for a non-inreg argument.
    if (!Arg.hasInRegAttr())
      break;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      break;

    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (ArgLocs.empty()) {
      // GlobalISel path: no pre-computed argument locations, allocate from
      // the IR-level offset and size.
      allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset, ArgOffset,
                          AllocSize, Arg.getArgNo(), MF, TRI, Info, CCInfo);
    } else {
      // DAG isel path: consume the CCValAssign locations covering this
      // argument; allocPreloadKernArg returns false once a location falls
      // past this argument (or SGPRs run out), ending this argument's run.
      for (; LocIdx < ArgLocs.size(); LocIdx++) {
        CCValAssign &ArgLoc = ArgLocs[LocIdx];
        assert(ArgLoc.isMemLoc());
        uint64_t LocOffset = ArgLoc.getLocMemOffset();
        unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
        if (!allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
                                 LocOffset, LocSize, LocIdx, MF, TRI, Info,
                                 CCInfo))
          break;
      }
    }
  }
}
Expand Down Expand Up @@ -2854,7 +2877,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
if (IsKernel && Subtarget->hasKernargPreload())
allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);

allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {

void allocatePreloadKernArgSGPRs(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ISD::InputArg> &Ins,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
Expand Down
Loading