Skip to content

Commit 4250bec

Browse files
committed
[AArch64][SME] Support Windows/stack probes in MachineSMEABIPass
On Windows, or with stack probes enabled on other targets, additional code needs to be inserted after dynamic stack allocations to validate stack accesses and/or ensure enough stack space has been allocated. Rather than handling this case in the MachineSMEABIPass (as we do for the standard case), we allocate the memory for the lazy save buffer in SelectionDAG, which allows the existing expansions to emit the correct code. Note: this means that in these cases we may allocate a lazy save buffer even when there are no lazy saves present in the function (since we have to allocate the buffer before the MachineSMEABIPass runs).

Change-Id: If89ab54c4de79f6fe5513a6b387e9e349f7bc7d1
1 parent 26d884c commit 4250bec

File tree

7 files changed

+160
-14
lines changed

7 files changed

+160
-14
lines changed

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,6 +1642,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
16421642
}
16431643
case AArch64::InOutZAUsePseudo:
16441644
case AArch64::RequiresZASavePseudo:
1645+
case AArch64::SMEStateAllocPseudo:
16451646
case AArch64::COALESCER_BARRIER_FPR16:
16461647
case AArch64::COALESCER_BARRIER_FPR32:
16471648
case AArch64::COALESCER_BARRIER_FPR64:

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8154,7 +8154,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
81548154
if (Subtarget->hasCustomCallingConv())
81558155
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
81568156

8157-
if (!Subtarget->useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
8157+
if (Subtarget->useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
8158+
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8159+
SDValue Size;
8160+
if (Attrs.hasZAState()) {
8161+
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8162+
DAG.getConstant(1, DL, MVT::i32));
8163+
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8164+
} else if (Attrs.hasAgnosticZAInterface()) {
8165+
SDValue Callee = DAG.getExternalSymbol(
8166+
"__arm_sme_state_size", getPointerTy(DAG.getDataLayout()));
8167+
auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8168+
TargetLowering::CallLoweringInfo CLI(DAG);
8169+
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8170+
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1,
8171+
RetTy, Callee, {});
8172+
std::tie(Size, Chain) = LowerCallTo(CLI);
8173+
}
8174+
if (Size) {
8175+
SDValue Buffer = DAG.getNode(
8176+
ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8177+
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8178+
Chain = Buffer.getValue(1);
8179+
8180+
Register BufferPtr =
8181+
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8182+
Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8183+
Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8184+
DAG.getVTList(MVT::Other), Chain);
8185+
FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8186+
MFI.CreateVariableSizedObject(Align(16), nullptr);
8187+
}
8188+
}
8189+
} else {
81588190
// Old SME ABI lowering (deprecated):
81598191
// Create a 16 Byte TPIDR2 object. The dynamic buffer
81608192
// will be expanded and stored in the static object later using a

llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
239239
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
240240
SMEAttrs SMEFnAttrs;
241241

242+
// Holds the TPIDR2 block if allocated early (for Windows/stack probes
243+
// support).
244+
Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
245+
242246
// Note: The following properties are only used for the old SME ABI lowering:
243247
/// The frame-index for the TPIDR2 object used for lazy saves.
244248
TPIDR2Object TPIDR2;
@@ -257,6 +261,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
257261
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
258262
const override;
259263

264+
void setEarlyAllocSMESaveBuffer(Register Ptr) {
265+
EarlyAllocSMESaveBuffer = Ptr;
266+
}
267+
268+
Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
269+
260270
// Old SME ABI lowering state getters/setters:
261271
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
262272
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ let hasSideEffects = 1 in {
9090
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
9191
}
9292

93+
def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
94+
9395
def CommitZAPseudo
9496
: Pseudo<(outs),
9597
(ins GPR64:$tpidr2_el0, i64imm:$restore_routine, variable_ops), []>,
@@ -105,6 +107,11 @@ def AArch64_requires_za_save
105107
[SDNPHasChain, SDNPInGlue]>;
106108
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
107109

110+
def AArch64_sme_state_alloc
111+
: SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
112+
[SDNPHasChain]>;
113+
def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
114+
108115
//===----------------------------------------------------------------------===//
109116
// Instruction naming conventions.
110117
//===----------------------------------------------------------------------===//

llvm/lib/Target/AArch64/MachineSMEABIPass.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ struct MachineSMEABI : public MachineFunctionPass {
166166
SmallVector<BlockInfo> Blocks;
167167
SmallVector<ZAState> BundleStates;
168168
std::optional<TPIDR2State> TPIDR2Block;
169+
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
169170
} State;
170171

171172
EdgeBundles *Bundles = nullptr;
@@ -212,6 +213,13 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
212213
MachineBasicBlock::iterator MBBI(MI);
213214
LiveUnits.stepBackward(MI);
214215
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
216+
// The SMEStateAllocPseudo marker is added to a function if the save
217+
// buffer was allocated in SelectionDAG. It marks the end of the
218+
// allocation -- which is a safe point for this pass to insert any TPIDR2
219+
// block setup.
220+
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
221+
State.AfterSMEProloguePt = MBBI;
222+
}
215223
auto [NeededState, InsertPt] = getInstNeededZAState(
216224
TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface());
217225
assert((InsertPt == MBBI ||
@@ -465,23 +473,25 @@ void MachineSMEABI::emitAllocateLazySaveBuffer(
465473
auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
466474
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
467475
MachineRegisterInfo &MRI = MF.getRegInfo();
476+
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
468477

469478
DebugLoc DL = getDebugLoc(MBB, MBBI);
470479
Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
471480
Register SVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
472-
Register Buffer = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
481+
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
473482

474483
// Calculate SVL.
475484
BuildMI(MBB, MBBI, DL, TII.get(AArch64::RDSVLI_XI), SVL).addImm(1);
476485

477486
// 1. Allocate the lazy save buffer.
478-
{
487+
if (Buffer == AArch64::NoRegister) {
479488
// TODO This function grows the stack with a subtraction, which doesn't work
480489
// on Windows. Some refactoring to share the functionality in
481490
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
482491
// supports SME
483492
assert(!Subtarget.isTargetWindows() &&
484493
"Lazy ZA save is not yet supported on Windows");
494+
Buffer = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
485495
// Get original stack pointer.
486496
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), SP).addReg(AArch64::SP);
487497
// Allocate a lazy-save buffer object of the size given, normally SVL * SVL
@@ -632,8 +642,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
632642

633643
// Allocate save buffer (if needed).
634644
if (State.TPIDR2Block.has_value()) {
635-
MachineBasicBlock &EntryBlock = MF.front();
636-
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
645+
if (State.AfterSMEProloguePt) {
646+
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
647+
// entry block (due to the probing loop).
648+
emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
649+
*State.AfterSMEProloguePt);
650+
} else {
651+
MachineBasicBlock &EntryBlock = MF.front();
652+
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
653+
}
637654
}
638655

639656
return true;
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
4+
5+
declare void @private_za_callee()
6+
declare void @shared_za_callee() "aarch64_inout_za"
7+
8+
define void @test_lazy_save() nounwind "aarch64_inout_za" {
9+
; CHECK-LABEL: test_lazy_save:
10+
; CHECK: // %bb.0:
11+
; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
12+
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
13+
; CHECK-NEXT: mov x29, sp
14+
; CHECK-NEXT: sub sp, sp, #16
15+
; CHECK-NEXT: rdsvl x8, #1
16+
; CHECK-NEXT: mul x9, x8, x8
17+
; CHECK-NEXT: lsr x15, x9, #4
18+
; CHECK-NEXT: bl __chkstk
19+
; CHECK-NEXT: sub x9, sp, x15, lsl #4
20+
; CHECK-NEXT: mov sp, x9
21+
; CHECK-NEXT: stur x9, [x29, #-16]
22+
; CHECK-NEXT: sub x9, x29, #16
23+
; CHECK-NEXT: sturh wzr, [x29, #-6]
24+
; CHECK-NEXT: stur wzr, [x29, #-4]
25+
; CHECK-NEXT: sturh w8, [x29, #-8]
26+
; CHECK-NEXT: msr TPIDR2_EL0, x9
27+
; CHECK-NEXT: bl private_za_callee
28+
; CHECK-NEXT: smstart za
29+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
30+
; CHECK-NEXT: sub x0, x29, #16
31+
; CHECK-NEXT: cbnz x8, .LBB0_2
32+
; CHECK-NEXT: // %bb.1:
33+
; CHECK-NEXT: bl __arm_tpidr2_restore
34+
; CHECK-NEXT: .LBB0_2:
35+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
36+
; CHECK-NEXT: mov sp, x29
37+
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
38+
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
39+
; CHECK-NEXT: ret
40+
;
41+
; CHECK-NEWLOWERING-LABEL: test_lazy_save:
42+
; CHECK-NEWLOWERING: // %bb.0:
43+
; CHECK-NEWLOWERING-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
44+
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
45+
; CHECK-NEWLOWERING-NEXT: mov x29, sp
46+
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
47+
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
48+
; CHECK-NEWLOWERING-NEXT: mul x9, x8, x8
49+
; CHECK-NEWLOWERING-NEXT: lsr x15, x9, #4
50+
; CHECK-NEWLOWERING-NEXT: bl __chkstk
51+
; CHECK-NEWLOWERING-NEXT: sub x9, sp, x15, lsl #4
52+
; CHECK-NEWLOWERING-NEXT: mov sp, x9
53+
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
54+
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
55+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
56+
; CHECK-NEWLOWERING-NEXT: bl private_za_callee
57+
; CHECK-NEWLOWERING-NEXT: smstart za
58+
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
59+
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
60+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2
61+
; CHECK-NEWLOWERING-NEXT: // %bb.1:
62+
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
63+
; CHECK-NEWLOWERING-NEXT: .LBB0_2:
64+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
65+
; CHECK-NEWLOWERING-NEXT: mov sp, x29
66+
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
67+
; CHECK-NEWLOWERING-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
68+
; CHECK-NEWLOWERING-NEXT: ret
69+
call void @private_za_callee()
70+
ret void
71+
}

llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ exit:
103103
ret float %ret
104104
}
105105

106-
; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
107106
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
108107
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
109108
; CHECK: // %bb.0:
@@ -165,26 +164,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
165164
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
166165
; CHECK-NEWLOWERING-NEXT: mov x9, sp
167166
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
167+
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
168+
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
169+
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
170+
; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
171+
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
172+
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
173+
; CHECK-NEWLOWERING-NEXT: b .LBB2_1
174+
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
168175
; CHECK-NEWLOWERING-NEXT: mov sp, x9
176+
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
169177
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
170178
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
171179
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
172-
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
173-
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
180+
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
181+
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
174182
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
175183
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
176-
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
177-
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
184+
; CHECK-NEWLOWERING-NEXT: b .LBB2_6
185+
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
178186
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
179187
; CHECK-NEWLOWERING-NEXT: bl cosf
180-
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
188+
; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
181189
; CHECK-NEWLOWERING-NEXT: smstart za
182190
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
183191
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
184-
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
185-
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
192+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
193+
; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
186194
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
187-
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
195+
; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
188196
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
189197
; CHECK-NEWLOWERING-NEXT: mov sp, x29
190198
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload

0 commit comments

Comments
 (0)