diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 57dcd68595ff1..79655e1c9529c 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   }
   case AArch64::InOutZAUsePseudo:
   case AArch64::RequiresZASavePseudo:
+  case AArch64::SMEStateAllocPseudo:
   case AArch64::COALESCER_BARRIER_FPR16:
   case AArch64::COALESCER_BARRIER_FPR32:
   case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b8335113e4687..ca71205205b53 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8291,7 +8291,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+  if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+    if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+      SDValue Size;
+      if (Attrs.hasZAState()) {
+        SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                                  DAG.getConstant(1, DL, MVT::i32));
+        Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+      } else if (Attrs.hasAgnosticZAInterface()) {
+        RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+        SDValue Callee = DAG.getExternalSymbol(
+            getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+        auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+        TargetLowering::CallLoweringInfo CLI(DAG);
+        CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+            getLibcallCallingConv(LC), RetTy, Callee, {});
+        std::tie(Size, Chain) = LowerCallTo(CLI);
+      }
+      if (Size) {
+        SDValue Buffer = DAG.getNode(
+            ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+        Chain = Buffer.getValue(1);
+
+        Register BufferPtr =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+        Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+                            DAG.getVTList(MVT::Other), Chain);
+        FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+        MFI.CreateVariableSizedObject(Align(16), nullptr);
+      }
+    }
+  } else {
     // Old SME ABI lowering (deprecated):
     // Create a 16 Byte TPIDR2 object. The dynamic buffer
     // will be expanded and stored in the static object later using a
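For reference: the `Size` computed above for ZA state follows from the SME ABI's lazy-save scheme, where ZA is a square tile of SVL x SVL bytes (SVL being the streaming vector length in bytes, read with `rdsvl`). A minimal illustrative sketch of that relationship -- the helper name is invented and this is not patch code:

#include <cstdint>

// ZA holds SVL rows of SVL bytes each, so a full lazy-save buffer
// needs SVL^2 bytes. svlBytes is the value returned by rdsvl #1.
uint64_t zaSaveBufferBytes(uint64_t svlBytes) {
  return svlBytes * svlBytes;
}

For example, with 512-bit streaming vectors (SVL = 64 bytes) the buffer is 4 KiB. This is the `AArch64ISD::RDSVL` / `ISD::MUL` pair feeding `ISD::DYNAMIC_STACKALLOC` above.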
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 1fde87e65a34b..31bd72bfa77a0 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -238,6 +238,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // Holds the SME function attributes (streaming mode, ZA/ZT0 state).
   SMEAttrs SMEFnAttrs;
 
+  // Holds the TPIDR2 block if allocated early (for Windows/stack probes
+  // support).
+  Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
   // Note: The following properties are only used for the old SME ABI lowering:
   /// The frame-index for the TPIDR2 object used for lazy saves.
   TPIDR2Object TPIDR2;
@@ -256,6 +260,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
       const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void setEarlyAllocSMESaveBuffer(Register Ptr) {
+    EarlyAllocSMESaveBuffer = Ptr;
+  }
+
+  Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
+
   // Old SME ABI lowering state getters/setters:
   Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
   void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 0d8cb3a76d0be..601dc34d74b9c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
 def CommitZASavePseudo
     : Pseudo<(outs),
              (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -108,6 +110,11 @@ def AArch64_requires_za_save
                    [SDNPHasChain, SDNPInGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
 
+def AArch64_sme_state_alloc
+    : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
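These two pieces form the handshake between ISel and the later machine pass: LowerFormalArguments records the buffer vreg via setEarlyAllocSMESaveBuffer and emits the chain-only SME_STATE_ALLOC marker (zero results, zero operands, per `SDTypeProfile<0, 0, []>`), which instruction selection matches to SMEStateAllocPseudo. A condensed sketch of the consumer side using the accessors added above (assumed context: a machine pass holding a MachineFunction &MF):

// Reading the handshake back in a machine pass (sketch).
// AArch64::NoRegister signals that no SelectionDAG-side allocation happened.
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
if (Buffer == AArch64::NoRegister) {
  // No early allocation: the pass must allocate the lazy-save buffer itself.
}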
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b58dfdf32e4ab..e8cbeacb98192 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
     SmallVector<BlockInfo> Blocks;
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
+    std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
   } State;
 
   MachineFunction *MF = nullptr;
@@ -298,6 +299,13 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
       LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+      // The SMEStateAllocPseudo marker is added to a function if the save
+      // buffer was allocated in SelectionDAG. It marks the end of the
+      // allocation -- which is a safe point for this pass to insert any TPIDR2
+      // block setup.
+      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+        State.AfterSMEProloguePt = MBBI;
+      }
       auto [NeededState, InsertPt] = getZAStateBeforeInst(
           *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
       assert((InsertPt == MBBI ||
@@ -529,23 +537,27 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
 void MachineSMEABI::emitAllocateLazySaveBuffer(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
+  auto *AFI = MF->getInfo<AArch64FunctionInfo>();
   DebugLoc DL = getDebugLoc(MBB, MBBI);
 
   Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
   Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-  Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
 
   // Calculate SVL.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
 
   // 1. Allocate the lazy save buffer.
-  {
-    // TODO This function grows the stack with a subtraction, which doesn't work
-    // on Windows. Some refactoring to share the functionality in
-    // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
-    // supports SME
+  if (Buffer == AArch64::NoRegister) {
+    // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
+    // Buffer != AArch64::NoRegister). This is done to reuse the existing
+    // expansions (which can insert stack checks). This works, but it means we
+    // will always allocate the lazy save buffer (even if the function contains
+    // no lazy saves). If we want to handle Windows here, we'll need to
+    // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
     assert(!Subtarget->isTargetWindows() &&
            "Lazy ZA save is not yet supported on Windows");
+    Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
     // Get original stack pointer.
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
         .addReg(AArch64::SP);
@@ -686,8 +698,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
 
   // Allocate save buffer (if needed).
   if (State.TPIDR2Block) {
-    MachineBasicBlock &EntryBlock = MF.front();
-    emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    if (State.AfterSMEProloguePt) {
+      // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+      // entry block (due to the probing loop).
+      emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+                                 *State.AfterSMEProloguePt);
+    } else {
+      MachineBasicBlock &EntryBlock = MF.front();
+      emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    }
   }
 
   return true;
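The 16-byte object this pass initializes (and that the SelectionDAG path reserves with MFI.CreateVariableSizedObject(Align(16), nullptr)) is the TPIDR2 block defined by the AAPCS64 SME ABI. A sketch of its layout for orientation -- the struct is illustrative, not LLVM code:

#include <cstdint>

// TPIDR2 block per the AAPCS64 SME ABI: while a lazy ZA save is armed,
// TPIDR2_EL0 points at this 16-byte, 16-byte-aligned structure.
struct TPIDR2Block {
  uint64_t za_save_buffer;     // bytes 0-7: address of the SVL*SVL buffer
  uint16_t num_za_save_slices; // bytes 8-9: number of ZA rows to save
  uint8_t reserved[6];         // bytes 10-15: reserved, must be zero
};

In the tests below this appears as `stp x9, x8, [x29, #-16]` (buffer pointer plus slice count; storing the full 64-bit `rdsvl` result also zeroes the reserved bytes, since SVL in bytes fits in 16 bits), followed by `msr TPIDR2_EL0` with the block's address.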
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
new file mode 100644
index 0000000000000..1c341e8daf491
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s
+
+declare void @private_za_callee()
+declare void @shared_za_callee() "aarch64_inout_za"
+
+define void @test_lazy_save() nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test_lazy_save:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    lsr x15, x9, #4
+; CHECK-NEXT:    bl __chkstk
+; CHECK-NEXT:    sub x9, sp, x15, lsl #4
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    bl private_za_callee
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @private_za_callee()
+  ret void
+}
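On Windows the allocation in the test above is probed through `__chkstk`, which uses a non-standard convention: the caller passes the allocation size in x15 as a count of 16-byte units, the routine probes the pages, and the caller then adjusts sp itself -- hence the `lsr x15, x9, #4` before the call and `sub x9, sp, x15, lsl #4` after it. A trivial sketch of the size handoff (helper name invented):

#include <cstdint>

// Windows AArch64 __chkstk protocol: x15 carries the requested size in
// 16-byte units; after the call, the caller performs sp -= (x15 << 4).
uint64_t chkstkUnits(uint64_t sizeBytes) {
  return sizeBytes >> 4;
}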
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 4ab553d79405d..066ee3b040469 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -99,7 +99,6 @@ exit:
   ret float %ret
 }
 
-; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
 define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
 ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
 ; CHECK: // %bb.0:
@@ -157,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEWLOWERING-NEXT:    cmp sp, x9
+; CHECK-NEWLOWERING-NEXT:    b.le .LBB2_3
+; CHECK-NEWLOWERING-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEWLOWERING-NEXT:    str xzr, [sp]
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_1
+; CHECK-NEWLOWERING-NEXT:  .LBB2_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
+; CHECK-NEWLOWERING-NEXT:    ldr xzr, [sp]
 ; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %use_b
+; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_5
+; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
 ; CHECK-NEWLOWERING-NEXT:    fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT:    b .LBB2_3
-; CHECK-NEWLOWERING-NEXT:  .LBB2_2: // %use_c
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_6
+; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %use_c
 ; CHECK-NEWLOWERING-NEXT:    fmov s0, s1
 ; CHECK-NEWLOWERING-NEXT:    bl cosf
-; CHECK-NEWLOWERING-NEXT:  .LBB2_3: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_6: // %exit
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_8
+; CHECK-NEWLOWERING-NEXT:  // %bb.7: // %exit
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_8: // %exit
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload