Commit b98f902

GlobalISel: Restructure argument lowering loop in handleAssignments
This was structured in a way that implied every split argument is either entirely in memory or entirely in registers. It is possible to pass an original argument partially in registers and partially in memory. Transpose the logic here to only consider a single piece at a time. Every individual CCValAssign should be treated independently, and any merge into the original value needs to be handled later.

This is in preparation for merging some preprocessing hacks in the AMDGPU calling convention lowering into the generic code.

I'm also not sure what the correct behavior is for memlocs where the promoted size is larger than the original value. I've opted to clamp the memory access size so it does not exceed the value register, which avoids an explicit trunc/extend/vector-widen/vector-extract instruction. This happens on AMDGPU for i8 arguments that end up stack passed, which are promoted to i16 (I think this is a preexisting DAG bug, though, and they should not really be promoted when in memory).
1 parent d523a8d
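To make the mixed case concrete: below is a minimal standalone sketch, not LLVM code, of one original argument whose pieces land partly in registers and partly on the stack. The register name `x7`, the piece sizes, and the one-free-register scenario are all hypothetical.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy stand-in for a CCValAssign: each piece of a split argument gets its
// own location, which may independently be a register or a stack slot.
struct PieceLoc {
  bool InReg;
  std::string Reg;    // valid when InReg
  unsigned StackOff;  // valid when !InReg
};

int main() {
  // An i128 split into two i64 pieces, with only one argument register left:
  // piece 0 is assigned the register, piece 1 spills to the stack.
  std::vector<std::string> FreeRegs = {"x7"};
  std::vector<PieceLoc> Locs;
  unsigned NextStackOff = 0;
  const unsigned NumPieces = 2, PieceSize = 8;

  for (unsigned P = 0; P != NumPieces; ++P) {
    if (!FreeRegs.empty()) {
      Locs.push_back({true, FreeRegs.back(), 0});
      FreeRegs.pop_back();
    } else {
      Locs.push_back({false, "", NextStackOff});
      NextStackOff += PieceSize;
    }
  }

  for (unsigned P = 0; P != Locs.size(); ++P) {
    if (Locs[P].InReg)
      std::printf("piece %u -> register %s\n", P, Locs[P].Reg.c_str());
    else
      std::printf("piece %u -> stack offset %u\n", P, Locs[P].StackOff);
  }
  return 0;
}
```

The old top-level reg-vs-mem branch decided once per original argument, so a placement like this could not be expressed; the per-piece loop in the CallLowering.cpp change below can.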

4 files changed: +83 −69 lines


llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h

+1 −0
```diff
@@ -147,6 +147,7 @@ class CallLowering {
   virtual void assignValueToAddress(const ArgInfo &Arg, Register Addr,
                                     uint64_t Size, MachinePointerInfo &MPO,
                                     CCValAssign &VA) {
+    assert(Arg.Regs.size() == 1);
     assignValueToAddress(Arg.Regs[0], Addr, Size, MPO, VA);
   }
```
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp

+74 −67
```diff
@@ -313,80 +313,87 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     EVT VAVT = VA.getValVT();
     const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
 
-    if (VA.isRegLoc()) {
-      if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) {
-        if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
-          // Expected to be multiple regs for a single incoming arg.
-          unsigned NumArgRegs = Args[i].Regs.size();
-          if (NumArgRegs < 2)
-            return false;
-
-          assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
-                 "Too many regs for number of args");
-          for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
-            // There should be Regs.size() ArgLocs per argument.
-            VA = ArgLocs[j + Part];
-            Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
-          }
-          j += NumArgRegs - 1;
-          // Merge the split registers into the expected larger result vreg
-          // of the original call.
-          MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
-          continue;
-        }
-        const LLT VATy(VAVT.getSimpleVT());
-        Register NewReg =
-            MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
-        Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
-        // If it's a vector type, we either need to truncate the elements
-        // or do an unmerge to get the lower block of elements.
-        if (VATy.isVector() &&
-            VATy.getNumElements() > OrigVT.getVectorNumElements()) {
-          // Just handle the case where the VA type is 2 * original type.
-          if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) {
-            LLVM_DEBUG(dbgs()
-                       << "Incoming promoted vector arg has too many elts");
-            return false;
-          }
-          auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg});
-          MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0));
-        } else {
-          MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
+    // Expected to be multiple regs for a single incoming arg.
+    // There should be Regs.size() ArgLocs per argument.
+    unsigned NumArgRegs = Args[i].Regs.size();
+
+    assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
+           "Too many regs for number of args");
+    for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
+      // There should be Regs.size() ArgLocs per argument.
+      VA = ArgLocs[j + Part];
+      if (VA.isMemLoc()) {
+        // Don't currently support loading/storing a type that needs to be split
+        // to the stack. Should be easy, just not implemented yet.
+        if (NumArgRegs > 1) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Load/store a split arg to/from the stack not implemented yet\n");
+          return false;
         }
-      } else if (!Handler.isIncomingArgumentHandler()) {
-        assert((j + (Args[i].Regs.size() - 1)) < ArgLocs.size() &&
-               "Too many regs for number of args");
-        // This is an outgoing argument that might have been split.
-        for (unsigned Part = 0; Part < Args[i].Regs.size(); ++Part) {
-          // There should be Regs.size() ArgLocs per argument.
-          VA = ArgLocs[j + Part];
-          Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+
+        // FIXME: Use correct address space for pointer size
+        EVT LocVT = VA.getValVT();
+        unsigned MemSize = LocVT == MVT::iPTR ? DL.getPointerSize()
+                                              : LocVT.getStoreSize();
+        unsigned Offset = VA.getLocMemOffset();
+        MachinePointerInfo MPO;
+        Register StackAddr = Handler.getStackAddress(MemSize, Offset, MPO);
+        Handler.assignValueToAddress(Args[i], StackAddr,
+                                     MemSize, MPO, VA);
+        continue;
+      }
+
+      assert(VA.isRegLoc() && "custom loc should have been handled already");
+
+      if (OrigVT.getSizeInBits() >= VAVT.getSizeInBits() ||
+          !Handler.isIncomingArgumentHandler()) {
+        // This is an argument that might have been split. There should be
+        // Regs.size() ArgLocs per argument.
+
+        // Insert the argument copies. If VAVT < OrigVT, we'll insert the merge
+        // to the original register after handling all of the parts.
+        Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+        continue;
+      }
+
+      // This ArgLoc covers multiple pieces, so we need to split it.
+      const LLT VATy(VAVT.getSimpleVT());
+      Register NewReg =
+          MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
+      Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
+      // If it's a vector type, we either need to truncate the elements
+      // or do an unmerge to get the lower block of elements.
+      if (VATy.isVector() &&
+          VATy.getNumElements() > OrigVT.getVectorNumElements()) {
+        // Just handle the case where the VA type is 2 * original type.
+        if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) {
+          LLVM_DEBUG(dbgs()
+                     << "Incoming promoted vector arg has too many elts");
+          return false;
         }
-        j += Args[i].Regs.size() - 1;
+        auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg});
+        MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0));
       } else {
-        Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+        MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
      }
-    } else if (VA.isMemLoc()) {
-      // Don't currently support loading/storing a type that needs to be split
-      // to the stack. Should be easy, just not implemented yet.
-      if (Args[i].Regs.size() > 1) {
-        LLVM_DEBUG(
-            dbgs()
-            << "Load/store a split arg to/from the stack not implemented yet");
-        return false;
+    }
+
+    // Now that all pieces have been handled, re-pack any arguments into any
+    // wider, original registers.
+    if (Handler.isIncomingArgumentHandler()) {
+      if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
+        assert(NumArgRegs >= 2);
+
+        // Merge the split registers into the expected larger result vreg
+        // of the original call.
+        MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
      }
-      MVT VT = MVT::getVT(Args[i].Ty);
-      unsigned Size = VT == MVT::iPTR ? DL.getPointerSize()
-                                      : alignTo(VT.getSizeInBits(), 8) / 8;
-      unsigned Offset = VA.getLocMemOffset();
-      MachinePointerInfo MPO;
-      Register StackAddr = Handler.getStackAddress(Size, Offset, MPO);
-      Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA);
-    } else {
-      // FIXME: Support byvals and other weirdness
-      return false;
     }
+
+    j += NumArgRegs - 1;
   }
+
   return true;
 }
```
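The control flow of the restructured loop is easier to see stripped of LLVM types. The sketch below is a toy model under assumed placements, not the real handlers: each piece is dispatched on its own, and the single merge back into the wide original vreg happens only after the loop, matching the `buildMerge` call in the hunk above.

```cpp
#include <cstdio>
#include <vector>

// Toy model of a CCValAssign piece: register or memory.
struct ToyVA { bool IsMem; };

int main() {
  // One original argument split into three pieces, the last one in memory --
  // the mixed case the old reg-vs-mem top-level branch could not express.
  std::vector<ToyVA> Pieces = {{false}, {false}, {true}};

  for (unsigned Part = 0; Part != Pieces.size(); ++Part) {
    if (Pieces[Part].IsMem) {
      std::printf("piece %u: assignValueToAddress via a stack slot\n", Part);
      continue;
    }
    std::printf("piece %u: assignValueToReg on its location register\n", Part);
  }

  // Only after every piece has a home is the original value re-packed
  // (G_MERGE_VALUES for incoming arguments narrower than the original type).
  std::printf("merge %zu pieces into the original vreg\n", Pieces.size());
  return 0;
}
```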

llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp

+2 −0
```diff
@@ -186,6 +186,8 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     if (!Arg.IsFixed)
       MaxSize = 0;
 
+    assert(Arg.Regs.size() == 1);
+
     Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                            ? extendRegister(Arg.Regs[0], VA, MaxSize)
                            : Arg.Regs[0];
```

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

+6 −2
```diff
@@ -144,13 +144,17 @@ struct IncomingArgHandler : public AMDGPUValueHandler {
     }
   }
 
-  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                             MachinePointerInfo &MPO, CCValAssign &VA) override {
     MachineFunction &MF = MIRBuilder.getMF();
 
+    // The reported memory location may be wider than the value.
+    const LLT RegTy = MRI.getType(ValVReg);
+    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
+
     // FIXME: Get alignment
     auto MMO = MF.getMachineMemOperand(
-        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
         inferAlignFromPtrInfo(MF, MPO));
     MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
   }
```
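The clamp is easy to sanity-check in isolation. A minimal sketch, assuming the i8-promoted-to-i16 stack case from the commit message (sizes hardcoded; no LLVM types involved):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // An i8 argument passed on the stack but promoted to i16: the reported
  // location size (2 bytes) is wider than the value register (1 byte).
  uint64_t RegSizeInBytes = 1; // size of the s8 value register
  uint64_t MemSize = 2;        // store size of the promoted i16 memloc

  // The clamp from the hunk above: never load more than the register holds,
  // so no explicit trunc/extend is needed after the load.
  MemSize = std::min(RegSizeInBytes, MemSize);
  assert(MemSize == 1);
  return 0;
}
```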
