[AArch64] Fix invalid use of ld1/st1 in stack alloc #105518

Merged: 5 commits, Sep 5, 2024
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (27 changes: 16 additions & 11 deletions)
@@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs(
     ByteOffset += StackFillDir * StackHazardSize;
     LastReg = RPI.Reg1;
 
+    int Scale = RPI.getScale();
     // Add the next reg to the pair if it is in the same register class.
     if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
       Register NextReg = CSI[i + RegInc].getReg();
@@ -3045,9 +3046,14 @@
       case RegPairInfo::PPR:
         break;
       case RegPairInfo::ZPR:
-        if (AFI->getPredicateRegForFillSpill() != 0)
-          if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+        if (AFI->getPredicateRegForFillSpill() != 0 &&
+            ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
+          // Calculate offset of register pair to see if pair instruction can be
+          // used.
+          int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
+          if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
             RPI.Reg2 = NextReg;
+        }
         break;
       case RegPairInfo::VG:
         break;
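
The guard added above mirrors the addressing-mode limits of the SVE consecutive-pair spill/fill forms (st1b/ld1b over {zN, zN+1}): the offset, counted in vector-length units, must be even and lie in [-16, 14]. A minimal standalone sketch of the predicate, with a hypothetical helper name rather than anything from the LLVM tree:

    // Offset is in vector-length units, i.e. the "#imm" that assembly
    // prints in "[sp, #imm, mul vl]". The paired forms address memory in
    // two-vector blocks, so the offset must be even, and the signed
    // immediate field restricts it to [-16, 14].
    static bool canUseZprPairSpill(int Offset) {
      return Offset % 2 == 0 && Offset >= -16 && Offset <= 14;
    }

The register-side conditions are unchanged in spirit: the pair must start at an even-numbered Z register (((RPI.Reg1 - AArch64::Z0) & 1) == 0) and the two registers must be consecutive (NextReg == RPI.Reg1 + 1).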
@@ -3087,7 +3093,6 @@
     if (NeedsWinCFI &&
         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
-    int Scale = RPI.getScale();
 
     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
     assert(OffsetPre % Scale == 0);
@@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
           MachineMemOperand::MOStore, Size, Alignment));
       MIB.addReg(PnReg);
       MIB.addReg(AArch64::SP)
-          .addImm(RPI.Offset) // [sp, #offset*scale],
-                              // where factor*scale is implicit
+          .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
+                                  // where 2*vscale is implicit
           .setMIFlag(MachineInstr::FrameSetup);
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       }
       MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
           .addReg(AArch64::SP)
-          .addImm(RPI.Offset) // [sp, #offset*scale],
-                              // where factor*scale is implicit
+          .addImm(RPI.Offset) // [sp, #offset*vscale],
+                              // where factor*vscale is implicit
           .setMIFlag(MachineInstr::FrameSetup);
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
           MachineMemOperand::MOLoad, Size, Alignment));
       MIB.addReg(PnReg);
       MIB.addReg(AArch64::SP)
-          .addImm(RPI.Offset) // [sp, #offset*scale]
-                              // where factor*scale is implicit
+          .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
+                                  // where 2*vscale is implicit
           .setMIFlag(MachineInstr::FrameDestroy);
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       }
       MIB.addReg(Reg1, getDefRegState(true));
       MIB.addReg(AArch64::SP)
-          .addImm(RPI.Offset) // [sp, #offset*scale]
-                              // where factor*scale is implicit
+          .addImm(RPI.Offset) // [sp, #offset*vscale]
+                              // where factor*vscale is implicit
           .setMIFlag(MachineInstr::FrameDestroy);
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
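With pairing now gated on a representable offset, the spill and restore paths scale the immediate differently for the two instruction shapes: the machine-level immediate of the paired st1b/ld1b counts two-vector blocks, so RPI.Offset (in single-vector units) is halved, while the unpaired str/ldr keep the raw offset and only their comments are corrected. A hedged sketch of that conversion, assuming a hypothetical helper rather than LLVM's MachineInstrBuilder API:

    #include <cassert>

    // VectorOffset plays the role of RPI.Offset: the slot's distance from
    // sp in single-vector units. The paired forms encode it in two-vector
    // units, hence the halving; the printed assembly scales it back up,
    // so an immediate of 1 appears as "[sp, #2, mul vl]".
    static int pairedSpillImm(int VectorOffset) {
      assert(VectorOffset % 2 == 0 && "paired spill offset must be even");
      return VectorOffset / 2;
    }
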
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll (68 changes: 36 additions & 32 deletions)
@@ -332,24 +332,25 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    ptrue pn8.b
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
 ; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
 ; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
 ; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
 ; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT:    ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
 ; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -427,24 +429,25 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    ptrue pn8.b
 ; FP-CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
 ; FP-CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
 ; FP-CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
 ; FP-CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
 ; FP-CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
 ; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
 ; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    addvl sp, sp, #1
 ; FP-CHECK-NEXT:    ptrue pn8.b
+; FP-CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT:    ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
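
The updated CHECK lines follow directly from the two changes above: every paired st1b/ld1b offset is halved to its corrected encoding (the z22/z23 spill moves from #4 to #2, z10/z11 from #28 to #14, and so on), and the z8/z9 pair, whose offset of #16 falls outside the even [-16, 14] window, is split into plain str/ldr accesses of z9 and z8 at #16 and #17.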