diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 5496ebd495a55..8d0ff41fc8c08 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); FunctionPass *createSMEPeepholeOptPass(); +FunctionPass *createMachineSMEABIPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -111,6 +112,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void initializeSMEABIPass(PassRegistry &); void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeMachineSMEABIPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 1d27e2776cbaf..18e5ea971f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,8 +92,9 @@ class AArch64ExpandPseudo : public MachineFunctionPass { bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock *expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + MachineBasicBlock * + expandCommitOrRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -990,10 +991,15 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -MachineBasicBlock * -AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { +static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; + bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; + assert((MI.getOpcode() == AArch64::RestoreZAPseudo || + MI.getOpcode() == AArch64::CommitZASavePseudo) && + "Expected ZA commit or restore"); assert((std::next(MBBI) != MBB.end() || MI.getParent()->successors().begin() != MI.getParent()->successors().end()) && @@ -1001,21 +1007,23 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Compare TPIDR2_EL0 value against 0. DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Cbz = BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)) - .add(MI.getOperand(0)); + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, + TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) + .add(MI.getOperand(0)); // Split MBB and create two new blocks: // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the RestoreZAPseudo instruction only. - // - EndBB contains all instructions after RestoreZAPseudo. + // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. + // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. MachineInstr &PrevMI = *std::prev(MBBI); MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() ? 
*SMBB->successors().begin() : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Cbz.addMBB(SMBB); + // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. + Branch.addMBB(SMBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); @@ -1023,11 +1031,29 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + if (IsRestoreZA) { + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + } else /*CommitZA*/ { + auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + // Clear TPIDR2_EL0. + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); + } + } + + BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); return EndBB; } @@ -1646,8 +1672,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::CommitZASavePseudo: case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandRestoreZA(MBB, MBBI); + auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; } @@ -1658,6 +1685,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; } + case AArch64::InOutZAUsePseudo: + case AArch64::RequiresZASavePseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: case AArch64::COALESCER_BARRIER_FPR64: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c27bf82157393..80e15e159c6ff 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17,6 +17,7 @@ #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" @@ -1998,6 +1999,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); } +const AArch64TargetMachine &AArch64TargetLowering::getTM() const { + return static_cast<const AArch64TargetMachine &>(getTargetMachine()); +} + void AArch64TargetLowering::addTypeForNEON(MVT VT) { assert(VT.isVector() && "VT should be a vector type"); @@ -8284,53 +8289,54 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Create a 16 Byte TPIDR2 object.
The dynamic buffer - // will be expanded and stored in the static object later using a pseudonode. - if (Attrs.hasZAState()) { - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); - SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); - } else { - SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); - Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, - DAG.getVTList(MVT::i64, MVT::Other), - {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); - } - Chain = DAG.getNode( - AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); - } else if (Attrs.hasAgnosticZAInterface()) { - // Call __arm_sme_state_size(). - SDValue BufferSize = - DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, - DAG.getVTList(MVT::i64, MVT::Other), Chain); - Chain = BufferSize.getValue(1); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = - DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize}); - } else { - // Allocate space dynamically. - Buffer = DAG.getNode( - ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), - {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); + if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) { + // Old SME ABI lowering (deprecated): + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a + // pseudonode. + if (Attrs.hasZAState()) { + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); + } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + } else if (Attrs.hasAgnosticZAInterface()) { + // Call __arm_sme_state_size(). + SDValue BufferSize = + DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, + DAG.getVTList(MVT::i64, MVT::Other), Chain); + Chain = BufferSize.getValue(1); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize}); + } else { + // Allocate space dynamically. 
+ Buffer = DAG.getNode( + ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + // Copy the value to a virtual register, and save that in FuncInfo. + Register BufferPtr = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + FuncInfo->setSMESaveBufferAddr(BufferPtr); + Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } - - // Copy the value to a virtual register, and save that in FuncInfo. - Register BufferPtr = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - FuncInfo->setSMESaveBufferAddr(BufferPtr); - Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } if (CallConv == CallingConv::PreserveNone) { @@ -8347,6 +8353,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } } + if (getTM().useNewSMEABILowering()) { + // Clear new ZT0 state. TODO: Move this to the SME ABI pass. + if (Attrs.isNewZT0()) + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return Chain; } @@ -8918,7 +8933,6 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; Args.emplace_back( DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), @@ -9059,14 +9073,28 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CallConv = CallingConv::AArch64_SVE_VectorCall; } + // Determine whether we need any streaming mode changes. + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); + bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface(); + auto ZAMarkerNode = [&]() -> std::optional<unsigned> { + // TODO: Handle agnostic ZA functions. + if (!UseNewSMEABILowering || IsAgnosticZAFunction) + return std::nullopt; + if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State()) + return std::nullopt; + return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE + : AArch64ISD::INOUT_ZA_USE; + }(); + if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && - CallConv != CallingConv::SwiftTail) + if (!ZAMarkerNode && !TailCallOpt && IsTailCall && + CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) IsSibCall = true; if (IsTailCall) @@ -9118,9 +9146,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } - // Determine whether we need any streaming mode changes.
- SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); - auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '"; @@ -9134,7 +9159,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return R; }; - bool RequiresLazySave = CallAttrs.requiresLazySave(); + bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); @@ -9209,10 +9234,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32)); - // Adjust the stack pointer for the new arguments... + // Adjust the stack pointer for the new arguments... and mark ZA uses. // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) + assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START"); + if (!IsSibCall) { Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); + if (ZAMarkerNode) { + // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply + // using a chain can result in incorrect scheduling. The markers refer to + // the position just before the CALLSEQ_START (though they occur after it, + // as CALLSEQ_START lacks in-glue). + Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other), + {Chain, Chain.getValue(1)}); + } + } SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -9683,7 +9718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (CallAttrs.requiresEnablingZAAfterCall()) + if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, @@ -9705,7 +9740,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -9717,7 +9751,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 78d6a507b80d3..071e96e194286 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,6 +23,8 @@ namespace llvm { +class AArch64TargetMachine; + namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits /// 23:22 of FPCR. @@ -64,6 +66,8 @@ class AArch64TargetLowering : public TargetLowering { explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); + const AArch64TargetMachine &getTM() const; + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op /// (op x, y), c1) where N0 is (op x, c1) and N1 is y.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, @@ -173,6 +177,10 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + + // Note: The following functions are only used as part of the old SME + // ABI lowering. They will be removed once -aarch64-new-sme-abi=true is the + // default. MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 76009809bf725..ed3374ae68d00 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -213,9 +213,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// or return type bool IsSVECC = false; - /// The frame-index for the TPIDR2 object used for lazy saves. - TPIDR2Object TPIDR2; - /// Whether this function changes streaming mode within the function. bool HasStreamingModeChanges = false; @@ -234,14 +231,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // true if PStateSMReg is used. bool PStateSMRegUsed = false; - // Holds a pointer to a buffer that is large enough to represent - // all SME ZA state and any additional state required by the - // __arm_sme_save/restore support routines. - Register SMESaveBufferAddr = MCRegister::NoRegister; - - // true if SMESaveBufferAddr is used. - bool SMESaveBufferUsed = false; - // Has the PNReg used to build PTRUE instruction. // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. unsigned PredicateRegForFillSpill = 0; @@ -253,6 +242,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // Holds the SME function attributes (streaming mode, ZA/ZT0 state). SMEAttrs SMEFnAttrs; + // Note: The following properties are only used for the old SME ABI lowering: + /// The frame-index for the TPIDR2 object used for lazy saves. + TPIDR2Object TPIDR2; + // Holds a pointer to a buffer that is large enough to represent + // all SME ZA state and any additional state required by the + // __arm_sme_save/restore support routines. + Register SMESaveBufferAddr = MCRegister::NoRegister; + // true if SMESaveBufferAddr is used.
+ bool SMESaveBufferUsed = false; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -261,6 +260,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const override; + // Old SME ABI lowering state getters/setters: + Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; + void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; + unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; + void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } + void setPredicateRegForFillSpill(unsigned Reg) { PredicateRegForFillSpill = Reg; } @@ -268,12 +274,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { return PredicateRegForFillSpill; } - Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; - void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; - - unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; - void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; - Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; @@ -289,8 +289,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } - void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 9c20087159d17..5c4e0c1093187 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -54,6 +54,10 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; +//===----------------------------------------------------------------------===// +// Old SME ABI lowering ISD nodes/pseudos (deprecated) +//===----------------------------------------------------------------------===// + def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPSideEffect]>; @@ -86,6 +90,30 @@ let usesCustomInserter = 1, Defs = [SP] in { def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), (AllocateSMESaveBuffer $size)>; +//===----------------------------------------------------------------------===// +// New SME ABI lowering ISD nodes/pseudos (-aarch64-new-sme-abi) +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isMeta = 1 in { + def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; } + +def CommitZASavePseudo + : Pseudo<(outs), + (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, + Sched<[]>; + +def AArch64_inout_za_use + : SDNode<"AArch64ISD::INOUT_ZA_USE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_inout_za_use), (InOutZAUsePseudo)>; + +def AArch64_requires_za_save + : SDNode<"AArch64ISD::REQUIRES_ZA_SAVE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
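For reference, the CommitZASavePseudo operands map directly onto its expansion in expandCommitOrRestoreZASave above: $tpidr2_el0 is the value tested by the conditional branch, $zero_za selects whether a ZERO_M is emitted once the save is committed, and $commit_routine plus the trailing regmask describe the __arm_tpidr2_save call. A minimal sketch of how a commit marker is built, mirroring MachineSMEABI::emitNewZAPrologue later in this patch (TPIDR2EL0 and ZeroZA as defined there):

    // Commit any lazy save pending in TPIDR2_EL0; optionally zero ZA after.
    auto CommitZASave =
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
            .addReg(TPIDR2EL0)      // $tpidr2_el0: value read from TPIDR2_EL0.
            .addImm(ZeroZA ? 1 : 0) // $zero_za: zero ZA once the save commits.
            .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
            .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
    if (ZeroZA) // The expansion's ZERO_M writes ZA, so mark it as a def.
      CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine);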
+ //===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 95eab16511e5a..e67bd5869ccd1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -224,6 +224,11 @@ static cl::opt<bool> cl::desc("Enable Machine Pipeliner for AArch64"), cl::init(false), cl::Hidden); +static cl::opt<bool> + EnableNewSMEABILowering("aarch64-new-sme-abi", + cl::desc("Enable new lowering for the SME ABI"), + cl::init(false), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. @@ -263,6 +268,7 @@ LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(PR); initializeKCFIPass(PR); initializeSMEABIPass(PR); + initializeMachineSMEABIPass(PR); initializeSMEPeepholeOptPass(PR); initializeSVEIntrinsicOptsPass(PR); initializeAArch64SpeculationHardeningPass(PR); @@ -367,7 +373,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, computeDefaultCPU(TT, CPU), FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveAArch64CodeModel(TT, CM, JIT), OL), - TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { + TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian), + UseNewSMEABILowering(EnableNewSMEABILowering) { initAsmInfo(); if (TT.isOSBinFormatMachO()) { @@ -668,10 +675,12 @@ void AArch64PassConfig::addIRPasses() { addPass(createInterleavedAccessPass()); } - // Expand any functions marked with SME attributes which require special - // changes for the calling convention or that require the lazy-saving - // mechanism specified in the SME ABI. - addPass(createSMEABIPass()); + if (!EnableNewSMEABILowering) { + // Expand any functions marked with SME attributes which require special + // changes for the calling convention or that require the lazy-saving + // mechanism specified in the SME ABI. + addPass(createSMEABIPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) { @@ -782,6 +791,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } void AArch64PassConfig::addMachineSSAOptimization() { + if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createMachineSMEABIPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt) addPass(createSMEPeepholeOptPass()); @@ -812,6 +824,9 @@ bool AArch64PassConfig::addILPOpts() { } void AArch64PassConfig::addPreRegAlloc() { + if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering) + addPass(createMachineSMEABIPass()); + // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOptLevel::None && EnableDeadRegisterElimination) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index b9e522dd6f226..0dd5d95b19b43 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -79,8 +79,12 @@ class AArch64TargetMachine : public CodeGenTargetMachineImpl { size_t clearLinkerOptimizationHints( const SmallPtrSetImpl<MachineInstr *> &MIs) const override; + /// Returns true if the new SME ABI lowering should be used.
+ bool useNewSMEABILowering() const { return UseNewSMEABILowering; } + private: bool isLittle; + bool UseNewSMEABILowering; }; // AArch64 little endian target machine. diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 66136a464f05d..803943fd57c4d 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -89,6 +89,7 @@ add_llvm_target(AArch64CodeGen SMEABIPass.cpp SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp + MachineSMEABIPass.cpp AArch64SIMDInstrOpt.cpp DEPENDS diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp new file mode 100644 index 0000000000000..f2502fd736931 --- /dev/null +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -0,0 +1,696 @@ +//===- MachineSMEABIPass.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements the SME ABI requirements for ZA state. This includes +// implementing the lazy ZA state save schemes around calls. +// +//===----------------------------------------------------------------------===// +// +// This pass works by collecting instructions that require ZA to be in a +// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state +// transitions to ensure ZA is in the required state before instructions. State +// transitions represent actions such as setting up or restoring a lazy save. +// Certain points within a function may also have predefined states independent +// of any instructions, for example, a "shared_za" function is always entered +// and exited in the "ACTIVE" state. +// +// To handle ZA state across control flow, we make use of edge bundling. This +// assigns each block an "incoming" and "outgoing" edge bundle (representing +// incoming and outgoing edges). Initially, these are unique to each block; +// then, in the process of forming bundles, the outgoing bundle of a block is +// joined with the incoming bundle of all successors. The result is that each +// bundle can be assigned a single ZA state, which ensures the state required by +// all of a block's successors is the same, and that each basic block will always +// be entered with the same ZA state. This eliminates the need for splitting +// edges to insert state transitions or "phi" nodes for ZA states. +// +// See below for a simple example of edge bundling. +// +// The following shows a conditionally executed basic block (BB1): +// +// if (cond) +// BB1 +// BB2 +// +// Initial Bundles Joined Bundles +// +// ┌──0──┐ ┌──0──┐ +// │ BB0 │ │ BB0 │ +// └──1──┘ └──1──┘ +// ├───────┐ ├───────┐ +// ▼ │ ▼ │ +// ┌──2──┐ │ ─────► ┌──1──┐ │ +// │ BB1 │ ▼ │ BB1 │ ▼ +// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐ +// └───►4 BB2 │ └───►1 BB2 │ +// └──5──┘ └──2──┘ +// +// On the left are the initial per-block bundles, and on the right are the +// joined bundles (which are the result of the EdgeBundles analysis).
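The joining step can be modelled in a few lines. Below is a self-contained toy sketch (illustrative only, not LLVM code, and ignoring refinements such as loop weighting) of the per-bundle vote that assignBundleZAStates later in this file performs over the joined bundles:

    #include <algorithm>
    #include <array>
    #include <vector>

    // Toy model of the bundle-state vote: each block votes for the state it
    // wants on its incoming and outgoing bundle; a bundle takes the most
    // popular state, defaulting to ACTIVE when no block has a preference.
    enum ZAState { ANY, ACTIVE, LOCAL_SAVED, NUM_ZA_STATE };

    struct Block {
      unsigned InBundle, OutBundle;  // Joined bundle IDs (as in the diagram).
      ZAState DesiredIn, DesiredOut; // Needed entry/exit state (ANY = none).
    };

    std::vector<ZAState> assignBundleStates(const std::vector<Block> &Blocks,
                                            unsigned NumBundles) {
      std::vector<ZAState> BundleStates(NumBundles, ACTIVE);
      for (unsigned B = 0; B < NumBundles; ++B) {
        std::array<int, NUM_ZA_STATE> Votes{};
        for (const Block &Blk : Blocks) {
          if (Blk.InBundle == B && Blk.DesiredIn != ANY)
            ++Votes[Blk.DesiredIn];
          if (Blk.OutBundle == B && Blk.DesiredOut != ANY)
            ++Votes[Blk.DesiredOut];
        }
        auto Best = std::max_element(Votes.begin(), Votes.end());
        if (*Best > 0) // Otherwise keep the ACTIVE default.
          BundleStates[B] = ZAState(Best - Votes.begin());
      }
      return BundleStates;
    }

    int main() {
      // The diagram's CFG: BB0 (bundles 0/1), BB1 (1/1), BB2 (1/2). Suppose
      // BB1 surrounds a private-ZA call and BB2 starts with a ZA use: bundle 1
      // sees votes {LOCAL_SAVED: 2, ACTIVE: 1} and becomes LOCAL_SAVED.
      std::vector<Block> Blocks = {{0, 1, ANY, ANY},
                                   {1, 1, LOCAL_SAVED, LOCAL_SAVED},
                                   {1, 2, ACTIVE, ANY}};
      return assignBundleStates(Blocks, 3)[1] == LOCAL_SAVED ? 0 : 1;
    }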
+ +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-machine-sme-abi" + +namespace { + +enum ZAState { + // Any/unknown state (not valid) + ANY = 0, + + // ZA is in use and active (i.e. within the accumulator) + ACTIVE, + + // A ZA save has been set up or committed (i.e. ZA is dormant or off) + LOCAL_SAVED, + + // ZA is off or a lazy save has been set up by the caller + CALLER_DORMANT, + + // ZA is off + OFF, + + // The number of ZA states (not a valid state) + NUM_ZA_STATE +}; + +/// A bitmask enum to record live physical registers that the "emit*" routines +/// may need to preserve. Note: This only tracks registers we may clobber. +enum LiveRegs : uint8_t { + None = 0, + NZCV = 1 << 0, + W0 = 1 << 1, + W0_HI = 1 << 2, + X0 = W0 | W0_HI, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI) +}; + +/// Holds the virtual registers live physical registers have been saved to. +struct PhysRegSave { + LiveRegs PhysLiveRegs; + Register StatusFlags = AArch64::NoRegister; + Register X0Save = AArch64::NoRegister; +}; + +static bool isLegalEdgeBundleZAState(ZAState State) { + switch (State) { + case ZAState::ACTIVE: + case ZAState::LOCAL_SAVED: + return true; + default: + return false; + } +} +struct TPIDR2State { + int FrameIndex = -1; +}; + +StringRef getZAStateString(ZAState State) { +#define MAKE_CASE(V) \ + case V: \ + return #V; + switch (State) { + MAKE_CASE(ZAState::ANY) + MAKE_CASE(ZAState::ACTIVE) + MAKE_CASE(ZAState::LOCAL_SAVED) + MAKE_CASE(ZAState::CALLER_DORMANT) + MAKE_CASE(ZAState::OFF) + default: + llvm_unreachable("Unexpected ZAState"); + } +#undef MAKE_CASE +} + +static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI, + const MachineOperand &MO) { + if (!MO.isReg() || !MO.getReg().isPhysical()) + return false; + return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) { + return AArch64::MPR128RegClass.contains(SR) || + AArch64::ZTRRegClass.contains(SR); + }); +} + +/// Returns the ZA state needed before \p MI and an iterator pointing +/// to where any code required to change the ZA state should be inserted. +static std::pair<ZAState, MachineBasicBlock::iterator> +getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, + bool ZAOffAtReturn) { + MachineBasicBlock::iterator InsertPt(MI); + + if (MI.getOpcode() == AArch64::InOutZAUsePseudo) + return {ZAState::ACTIVE, std::prev(InsertPt)}; + + if (MI.getOpcode() == AArch64::RequiresZASavePseudo) + return {ZAState::LOCAL_SAVED, std::prev(InsertPt)}; + + if (MI.isReturn()) + return {ZAOffAtReturn ?
ZAState::OFF : ZAState::ACTIVE, InsertPt}; + + for (auto &MO : MI.operands()) { + if (isZAorZT0RegOp(TRI, MO)) + return {ZAState::ACTIVE, InsertPt}; + } + + return {ZAState::ANY, InsertPt}; +} + +struct MachineSMEABI : public MachineFunctionPass { + inline static char ID = 0; + + MachineSMEABI() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "Machine SME ABI pass"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundlesWrapperLegacy>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + /// Collects the needed ZA state (and live registers) before each instruction + /// within the machine function. + void collectNeededZAStates(SMEAttrs); + + /// Assigns each edge bundle a ZA state based on the needed states of blocks + /// that have incoming or outgoing edges in that bundle. + void assignBundleZAStates(); + + /// Inserts code to handle changes between ZA states within the function. + /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA. + void insertStateChanges(); + + // Emission routines for private and shared ZA functions (using lazy saves). + void emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs); + void emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2); + + void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + ZAState From, ZAState To, LiveRegs PhysLiveRegs); + + /// Save live physical registers to virtual registers. + PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + /// Restore physical registers from a save of their previous values. + void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + + /// Get or create a TPIDR2 block in this function. + TPIDR2State getTPIDR2Block(); + +private: + /// Contains the needed ZA state (and live registers) at an instruction. + struct InstInfo { + ZAState NeededState{ZAState::ANY}; + MachineBasicBlock::iterator InsertPt; + LiveRegs PhysLiveRegs = LiveRegs::None; + }; + + /// Contains the needed ZA state for each instruction in a block. + /// Instructions that do not require a ZA state are not recorded. + struct BlockInfo { + ZAState FixedEntryState{ZAState::ANY}; + SmallVector<InstInfo> Insts; + LiveRegs PhysLiveRegsAtExit = LiveRegs::None; + }; + + // All pass state that must be cleared between functions.
+ struct PassState { + SmallVector<BlockInfo> Blocks; + SmallVector<ZAState> BundleStates; + std::optional<TPIDR2State> TPIDR2Block; + } State; + + MachineFunction *MF = nullptr; + EdgeBundles *EdgeBundles = nullptr; + const AArch64Subtarget *Subtarget = nullptr; + const AArch64RegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; +}; + +void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { + assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) && + "Expected function to have ZA/ZT0 state!"); + + State.Blocks.resize(MF->getNumBlockIDs()); + for (MachineBasicBlock &MBB : *MF) { + BlockInfo &Block = State.Blocks[MBB.getNumber()]; + if (&MBB == &MF->front()) { + // Entry block: + Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface() + ? ZAState::CALLER_DORMANT + : ZAState::ACTIVE; + } else if (MBB.isEHPad()) { + // EH entry block: + Block.FixedEntryState = ZAState::LOCAL_SAVED; + } + + LiveRegUnits LiveUnits(*TRI); + LiveUnits.addLiveOuts(MBB); + + auto GetPhysLiveRegs = [&] { + LiveRegs PhysLiveRegs = LiveRegs::None; + if (!LiveUnits.available(AArch64::NZCV)) + PhysLiveRegs |= LiveRegs::NZCV; + // We have to track W0 and X0 separately as otherwise things can get + // confused if we attempt to preserve X0 but only W0 was defined. + if (!LiveUnits.available(AArch64::W0)) + PhysLiveRegs |= LiveRegs::W0; + if (!LiveUnits.available(AArch64::W0_HI)) + PhysLiveRegs |= LiveRegs::W0_HI; + return PhysLiveRegs; + }; + + Block.PhysLiveRegsAtExit = GetPhysLiveRegs(); + auto FirstTerminatorInsertPt = MBB.getFirstTerminator(); + for (MachineInstr &MI : reverse(MBB)) { + MachineBasicBlock::iterator MBBI(MI); + LiveUnits.stepBackward(MI); + LiveRegs PhysLiveRegs = GetPhysLiveRegs(); + auto [NeededState, InsertPt] = getZAStateBeforeInst( + *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); + assert((InsertPt == MBBI || + InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) && + "Unexpected state change insertion point!"); + // TODO: Do something to avoid state changes where NZCV is live. + if (MBBI == FirstTerminatorInsertPt) + Block.PhysLiveRegsAtExit = PhysLiveRegs; + if (NeededState != ZAState::ANY) + Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs}); + } + + // Reverse vector (as we had to iterate backwards for liveness). + std::reverse(Block.Insts.begin(), Block.Insts.end()); + } +} + +void MachineSMEABI::assignBundleZAStates() { + State.BundleStates.resize(EdgeBundles->getNumBundles()); + for (unsigned I = 0, E = EdgeBundles->getNumBundles(); I != E; ++I) { + LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); + + // Attempt to assign a ZA state for this bundle that minimizes state + // transitions. Edges within loops are given a higher weight as we assume + // they will be executed more than once. + // TODO: We should propagate desired incoming/outgoing states through blocks + // that have the "ANY" state first to make better global decisions. + int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; + for (unsigned BlockID : EdgeBundles->getBlocks(I)) { + LLVM_DEBUG(dbgs() << "- bb."
<< BlockID); + + const BlockInfo &Block = State.Blocks[BlockID]; + if (Block.Insts.empty()) { + LLVM_DEBUG(dbgs() << " (no state preference)\n"); + continue; + } + bool InEdge = EdgeBundles->getBundle(BlockID, /*Out=*/false) == I; + bool OutEdge = EdgeBundles->getBundle(BlockID, /*Out=*/true) == I; + + ZAState DesiredIncomingState = Block.Insts.front().NeededState; + if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { + EdgeStateCounts[DesiredIncomingState]++; + LLVM_DEBUG(dbgs() << " DesiredIncomingState: " + << getZAStateString(DesiredIncomingState)); + } + ZAState DesiredOutgoingState = Block.Insts.back().NeededState; + if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) { + EdgeStateCounts[DesiredOutgoingState]++; + LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " + << getZAStateString(DesiredOutgoingState)); + } + LLVM_DEBUG(dbgs() << '\n'); + } + + ZAState BundleState = + ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); + + // Force ZA to be active in bundles that don't have a preferred state. + // TODO: Something better here (to avoid extra mode switches). + if (BundleState == ZAState::ANY) + BundleState = ZAState::ACTIVE; + + LLVM_DEBUG({ + dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n' + << "Edge counts:"; + for (auto [State, Count] : enumerate(EdgeStateCounts)) + dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count; + dbgs() << "\n\n"; + }); + + State.BundleStates[I] = BundleState; + } +} + +void MachineSMEABI::insertStateChanges() { + for (MachineBasicBlock &MBB : *MF) { + const BlockInfo &Block = State.Blocks[MBB.getNumber()]; + ZAState InState = State.BundleStates[EdgeBundles->getBundle(MBB.getNumber(), + /*Out=*/false)]; + + ZAState CurrentState = Block.FixedEntryState; + if (CurrentState == ZAState::ANY) + CurrentState = InState; + + for (auto &Inst : Block.Insts) { + if (CurrentState != Inst.NeededState) + emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState, + Inst.PhysLiveRegs); + CurrentState = Inst.NeededState; + } + + if (MBB.succ_empty()) + continue; + + ZAState OutState = State.BundleStates[EdgeBundles->getBundle( + MBB.getNumber(), /*Out=*/true)]; + if (CurrentState != OutState) + emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState, + Block.PhysLiveRegsAtExit); + } +} + +TPIDR2State MachineSMEABI::getTPIDR2Block() { + if (State.TPIDR2Block) + return *State.TPIDR2Block; + MachineFrameInfo &MFI = MF->getFrameInfo(); + State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)}; + return *State.TPIDR2Block; +} + +static DebugLoc getDebugLoc(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + if (MBBI != MBB.end()) + return MBBI->getDebugLoc(); + return DebugLoc(); +} + +void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get pointer to TPIDR2 block. + Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); + Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr) + .addReg(TPIDR2); + // Set TPIDR2_EL0 to point to TPIDR2 block. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(TPIDR2Ptr); +} + +PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + PhysRegSave RegSave{PhysLiveRegs}; + if (PhysLiveRegs & LiveRegs::NZCV) { + RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit); + } + // Note: Preserving X0 is "free" as this is before register allocation, so + // the register allocator is still able to optimize these copies. + if (PhysLiveRegs & LiveRegs::W0) { + RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI + ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save) + .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0); + } + return RegSave; +} + +void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + if (RegSave.StatusFlags != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(RegSave.StatusFlags) + .addReg(AArch64::NZCV, RegState::ImplicitDefine); + + if (RegSave.X0Save != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), + RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0) + .addReg(RegSave.X0Save); +} + +void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register TPIDR2 = AArch64::X0; + + // TODO: Emit these within the restore MBB to prevent unnecessary saves. + PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); + + // Enable ZA. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); + // Get current TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0) + .addImm(AArch64SysReg::TPIDR2_EL0); + // Get pointer to TPIDR2 block. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + // (Conditionally) restore ZA state. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo)) + .addReg(TPIDR2EL0) + .addReg(TPIDR2) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + // Zero TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + restorePhyRegSave(RegSave, MBB, MBBI, DL); +} + +void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + if (ClearTPIDR2) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + // Disable ZA. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(0); +} + +void MachineSMEABI::emitAllocateLazySaveBuffer( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineFrameInfo &MFI = MF->getFrameInfo(); + + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + // Calculate SVL. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1); + + // 1. Allocate the lazy save buffer. + { + // TODO: This function grows the stack with a subtraction, which doesn't work + // on Windows. Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME. + assert(!Subtarget->isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); + // Get original stack pointer. + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer) + .addReg(SVL) + .addReg(SVL) + .addReg(SP); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP) + .addReg(Buffer); + // We have just allocated a variable sized object, tell this to PEI. + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + // 2. Setup the TPIDR2 block. + { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); + + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi)) + .addReg(Buffer) + .addReg(SVL) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0); + } +} + +void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get current TPIDR2_EL0. + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS)) + .addReg(TPIDR2EL0, RegState::Define) + .addImm(AArch64SysReg::TPIDR2_EL0); + // If TPIDR2_EL0 is non-zero, commit the lazy save. + // NOTE: Functions that only use ZT0 don't need to zero ZA. + bool ZeroZA = + MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState(); + auto CommitZASave = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) + .addReg(TPIDR2EL0) + .addImm(ZeroZA ? 1 : 0) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + if (ZeroZA) + CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine); + // Enable ZA (as ZA could have previously been in the OFF state). + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); +} + +void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + ZAState From, ZAState To, + LiveRegs PhysLiveRegs) { + + // ZA not used. + if (From == ZAState::ANY || To == ZAState::ANY) + return; + + // If we're exiting from the CALLER_DORMANT state that means this new ZA + // function did not touch ZA (so ZA was never turned on).
+ if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF) + return; + + // TODO: Avoid setting up the save buffer if there's no transition to + // LOCAL_SAVED. + if (From == ZAState::CALLER_DORMANT) { + assert(MBB.getParent() + ->getInfo<AArch64FunctionInfo>() + ->getSMEFnAttrs() + .hasPrivateZAInterface() && + "CALLER_DORMANT state requires private ZA interface"); + assert(&MBB == &MBB.getParent()->front() && + "CALLER_DORMANT state only valid in entry block"); + emitNewZAPrologue(MBB, MBB.getFirstNonPHI()); + if (To == ZAState::ACTIVE) + return; // Nothing more to do (ZA is active after the prologue). + + // Note: "emitNewZAPrologue" zeros ZA, so we may need to set up a lazy save + // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this + // case by changing the placement of the zero instruction. + From = ZAState::ACTIVE; + } + + if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) + emitSetupLazySave(MBB, InsertPt); + else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) + emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs); + else if (To == ZAState::OFF) { + assert(From != ZAState::CALLER_DORMANT && + "CALLER_DORMANT to OFF should have already been handled"); + emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED); + } else { + dbgs() << "Error: Transition from " << getZAStateString(From) << " to " + << getZAStateString(To) << '\n'; + llvm_unreachable("Unimplemented state transition"); + } +} + +} // end anonymous namespace + +INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI", + false, false) + +bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getSubtarget<AArch64Subtarget>().hasSME()) + return false; + + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); + if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State()) + return false; + + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + // Reset pass state. + State = PassState{}; + this->MF = &MF; + EdgeBundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + collectNeededZAStates(SMEFnAttrs); + assignBundleZAStates(); + insertStateChanges(); + + // Allocate save buffer (if needed).
+ if (State.TPIDR2Block) { + MachineBasicBlock &EntryBlock = MF.front(); + emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI()); + } + + return true; +} + +FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 48f9da02d3182..d26e3cd3a9f76 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -206,7 +206,7 @@ class SMECallAttrs { } bool requiresEnablingZAAfterCall() const { - return requiresLazySave() || requiresDisablingZABeforeCall(); + return requiresDisablingZABeforeCall(); } bool requiresPreservingAllZAState() const { diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll b/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll new file mode 100644 index 0000000000000..eb23a0f77accf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sme,+sve -aarch64-new-sme-abi -stop-before=aarch64-machine-sme-abi -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-BEFORE-SMEABI +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sme,+sve -aarch64-new-sme-abi -stop-after=aarch64-machine-sme-abi -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-AFTER-SMEABI + +declare void @private_za_callee() +declare void @shared_za_callee() "aarch64_inout_za" +declare i64 @shared_za_callee_many_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) "aarch64_inout_za" + +; Tests a "RequiresZASavePseudo" is placed after the "ADJCALLSTACKDOWN" for private ZA callees. 
+define void @requires_za_save() nounwind "aarch64_inout_za" { + ; CHECK-BEFORE-SMEABI-LABEL: name: requires_za_save + ; CHECK-BEFORE-SMEABI: bb.0 (%ir-block.0): + ; CHECK-BEFORE-SMEABI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-SMEABI-NEXT: RequiresZASavePseudo + ; CHECK-BEFORE-SMEABI-NEXT: BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-BEFORE-SMEABI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-SMEABI-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-SMEABI-LABEL: name: requires_za_save + ; CHECK-AFTER-SMEABI: bb.0 (%ir-block.0): + ; CHECK-AFTER-SMEABI-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-SMEABI-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp + ; CHECK-AFTER-SMEABI-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]] + ; CHECK-AFTER-SMEABI-NEXT: $sp = COPY [[MSUBXrrr]] + ; CHECK-AFTER-SMEABI-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0 + ; CHECK-AFTER-SMEABI-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-AFTER-SMEABI-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[ADDXri]] + ; CHECK-AFTER-SMEABI-NEXT: MSR 56965, [[COPY1]] + ; CHECK-AFTER-SMEABI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-SMEABI-NEXT: RequiresZASavePseudo + ; CHECK-AFTER-SMEABI-NEXT: BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-AFTER-SMEABI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-SMEABI-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv + ; CHECK-AFTER-SMEABI-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv + ; CHECK-AFTER-SMEABI-NEXT: $x0 = ADDXri %stack.0, 0, 0 + ; CHECK-AFTER-SMEABI-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + ; CHECK-AFTER-SMEABI-NEXT: MSR 56965, $xzr + ; CHECK-AFTER-SMEABI-NEXT: RET_ReallyLR + call void @private_za_callee() + ret void +} + +; Tests ZA state markers like "RequiresZASavePseudo" are placed before any streaming mode changes. 
+define void @requires_za_save_streaming_mode_change() nounwind "aarch64_inout_za" "aarch64_pstate_sm_enabled" { + ; CHECK-BEFORE-SMEABI-LABEL: name: requires_za_save_streaming_mode_change + ; CHECK-BEFORE-SMEABI: bb.0 (%ir-block.0): + ; CHECK-BEFORE-SMEABI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-SMEABI-NEXT: RequiresZASavePseudo + ; CHECK-BEFORE-SMEABI-NEXT: VGSavePseudo + ; CHECK-BEFORE-SMEABI-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-SMEABI-NEXT: BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-BEFORE-SMEABI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-SMEABI-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-SMEABI-NEXT: VGRestorePseudo + ; CHECK-BEFORE-SMEABI-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-SMEABI-LABEL: name: requires_za_save_streaming_mode_change + ; CHECK-AFTER-SMEABI: bb.0 (%ir-block.0): + ; CHECK-AFTER-SMEABI-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-AFTER-SMEABI-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp + ; CHECK-AFTER-SMEABI-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]] + ; CHECK-AFTER-SMEABI-NEXT: $sp = COPY [[MSUBXrrr]] + ; CHECK-AFTER-SMEABI-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0 + ; CHECK-AFTER-SMEABI-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-AFTER-SMEABI-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[ADDXri]] + ; CHECK-AFTER-SMEABI-NEXT: MSR 56965, [[COPY1]] + ; CHECK-AFTER-SMEABI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-SMEABI-NEXT: RequiresZASavePseudo + ; CHECK-AFTER-SMEABI-NEXT: VGSavePseudo + ; CHECK-AFTER-SMEABI-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-SMEABI-NEXT: BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-AFTER-SMEABI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-SMEABI-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-SMEABI-NEXT: VGRestorePseudo + ; CHECK-AFTER-SMEABI-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv + ; CHECK-AFTER-SMEABI-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv + ; CHECK-AFTER-SMEABI-NEXT: $x0 = ADDXri %stack.0, 0, 0 + ; CHECK-AFTER-SMEABI-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + ; CHECK-AFTER-SMEABI-NEXT: MSR 56965, $xzr + ; CHECK-AFTER-SMEABI-NEXT: RET_ReallyLR + call void @private_za_callee() + ret void +} + +; Tests "InOutZAUsePseudo" is placed after the "ADJCALLSTACKDOWN" for shared ZA callees. 
+define void @inout_za_call() nounwind "aarch64_inout_za" { + ; CHECK-COMMON-LABEL: name: inout_za_call + ; CHECK-COMMON: bb.0 (%ir-block.0): + ; CHECK-COMMON-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COMMON-NEXT: InOutZAUsePseudo + ; CHECK-COMMON-NEXT: BL @shared_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-COMMON-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COMMON-NEXT: RET_ReallyLR + call void @shared_za_callee() + ret void +} + +; Tests ZA state markers like "InOutZAUsePseudo" are placed before any stack argument setup. +define void @many_args_inout_za_call(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) "aarch64_inout_za" { + ; CHECK-COMMON-LABEL: name: many_args_inout_za_call + ; CHECK-COMMON: bb.0 (%ir-block.10): + ; CHECK-COMMON-NEXT: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7 + ; CHECK-COMMON-NEXT: {{ $}} + ; CHECK-COMMON-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x7 + ; CHECK-COMMON-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x6 + ; CHECK-COMMON-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x5 + ; CHECK-COMMON-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY $x4 + ; CHECK-COMMON-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY $x3 + ; CHECK-COMMON-NEXT: [[COPY5:%[0-9]+]]:gpr64 = COPY $x2 + ; CHECK-COMMON-NEXT: [[COPY6:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-COMMON-NEXT: [[COPY7:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-COMMON-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui %fixed-stack.1, 0 :: (load (s64) from %fixed-stack.1, align 16) + ; CHECK-COMMON-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui %fixed-stack.0, 0 :: (load (s64) from %fixed-stack.0) + ; CHECK-COMMON-NEXT: ADJCALLSTACKDOWN 16, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COMMON-NEXT: InOutZAUsePseudo + ; CHECK-COMMON-NEXT: [[COPY8:%[0-9]+]]:gpr64sp = COPY $sp + ; CHECK-COMMON-NEXT: STRXui killed [[LDRXui1]], [[COPY8]], 1 :: (store (s64) into stack + 8) + ; CHECK-COMMON-NEXT: STRXui killed [[LDRXui]], [[COPY8]], 0 :: (store (s64) into stack) + ; CHECK-COMMON-NEXT: $x0 = COPY [[COPY7]] + ; CHECK-COMMON-NEXT: $x1 = COPY [[COPY6]] + ; CHECK-COMMON-NEXT: $x2 = COPY [[COPY5]] + ; CHECK-COMMON-NEXT: $x3 = COPY [[COPY4]] + ; CHECK-COMMON-NEXT: $x4 = COPY [[COPY3]] + ; CHECK-COMMON-NEXT: $x5 = COPY [[COPY2]] + ; CHECK-COMMON-NEXT: $x6 = COPY [[COPY1]] + ; CHECK-COMMON-NEXT: $x7 = COPY [[COPY]] + ; CHECK-COMMON-NEXT: BL @shared_za_callee_many_args, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit-def $sp, implicit-def $x0 + ; CHECK-COMMON-NEXT: ADJCALLSTACKUP 16, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COMMON-NEXT: [[COPY9:%[0-9]+]]:gpr64all = COPY $x0 + ; CHECK-COMMON-NEXT: RET_ReallyLR + %ret = call i64 @shared_za_callee_many_args(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir new file mode 100644 index 0000000000000..6ca9b9b6cb200 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir @@ -0,0 +1,101 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=aarch64-expand-pseudo -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck %s + +--- + +# X0 = TPIDR2 block pointer +# X8 = TPIDR2_EL0 +name: restore_za_save +alignment: 4 +tracksRegLiveness: true +body: | + 
bb.0: + ; CHECK-LABEL: name: restore_za_save + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = IMPLICIT_DEF + ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: CBZX $x8, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x8, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BL &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp, implicit $x0 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: RET undef $lr + $x0 = IMPLICIT_DEF + $x8 = MRS 56965, implicit-def $nzcv + + RestoreZAPseudo $x8, $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + + RET_ReallyLR + +... +--- + +# X8 = TPIDR2_EL0 +name: commit_za_save +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: commit_za_save + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: CBNZX $x8, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp + ; CHECK-NEXT: MSR 56965, $xzr + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: RET undef $lr + $x8 = MRS 56965, implicit-def $nzcv + + CommitZASavePseudo $x8, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + + RET_ReallyLR + +... +--- +# X8 = TPIDR2_EL0 +name: commit_za_save_zero_za +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: commit_za_save_zero_za + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: CBNZX $x8, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp, implicit-def $zab0 + ; CHECK-NEXT: MSR 56965, $xzr + ; CHECK-NEXT: ZERO_M 255, implicit-def $zab0 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: RET undef $lr + $x8 = MRS 56965, implicit-def $nzcv + + CommitZASavePseudo $x8, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0 + + RET_ReallyLR + +... 
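+
+# For reference, a rough sketch of what the two expansions above produce. Per
+# the AAPCS64 SME ABI, the TPIDR2 block (here addressed via X0) is 16 bytes:
+# bytes 0-7 hold the ZA save-buffer pointer, bytes 8-9 the number of saved ZA
+# slices, and bytes 10-15 are reserved (zero).
+#
+# RestoreZAPseudo restores only if some callee committed our lazy save
+# (i.e. TPIDR2_EL0 was cleared):
+#   cbz  x8, 1f                  // x8 = TPIDR2_EL0
+#   b    2f
+# 1: bl   __arm_tpidr2_restore   // x0 = TPIDR2 block pointer
+# 2: ...
+#
+# CommitZASavePseudo commits only if the lazy save is still pending
+# (TPIDR2_EL0 != 0), then clears the marker and optionally zeroes ZA:
+#   cbnz x8, 1f                  // x8 = TPIDR2_EL0
+#   b    2f
+# 1: bl   __arm_tpidr2_save
+#    msr  TPIDR2_EL0, xzr
+#    zero {za}                   // only with the zero flag (mask 255 = all of ZA)
+# 2: ...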
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index ba40ccd1c7406..3579baae1d7d8 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mattr=+sme2 < %s | FileCheck %s +; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s target triple = "aarch64" @@ -198,3 +199,41 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 } + +declare i64 @many_args_private_za_callee( + i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) + +; In this example some arguments are passed on the stack, which decrements the +; stack pointer before the call -- in this test the call to __arm_sme_save +; should occur _before_ the stack decrement. +define i64 @test_many_callee_arguments( +; CHECK-LABEL: test_many_callee_arguments: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: ldp x9, x10, [x29, #32] +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: stp x9, x10, [sp, #-16]! +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl many_args_private_za_callee +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9 +) nounwind "aarch64_za_state_agnostic" { + %ret = call i64 @many_args_private_za_callee( + i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 3f5e7e9f32a47..e623d3fb075f7 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -1,7 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING declare void @private_za_callee() +declare void @shared_za_callee() "aarch64_inout_za" +declare void @preserves_za_callee() "aarch64_preserves_za" + declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. @@ -35,6 +39,33 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_lazy_save_1_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB0_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void @private_za_callee() ret void } @@ -83,6 +114,34 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB1_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void @private_za_callee() call void @private_za_callee() ret void @@ -119,6 +178,33 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_lazy_save_expanded_intrinsic: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: bl cosf +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB2_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call float @llvm.cos.f32(float %a) ret float %res } @@ -174,6 +260,547 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_lazy_save_and_conditional_smstart: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: cntd x9 +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: mov x20, x0 +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #80 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: .LBB3_2: +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: .LBB3_4: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_6 +; CHECK-NEWLOWERING-NEXT: // %bb.5: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB3_6: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte 
Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
+  call void @private_za_callee()
+  ret void
+}
+
+; Note: For the final @private_za_callee() we set up a lazy save but then don't
+; restore from it (since ZA is off on return). We could improve this case
+; by turning ZA off before the final private ZA call.
+define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za"
+; CHECK-LABEL: test_lazy_save_mixed_shared_and_private_callees:
+; CHECK: // %bb.0: // %prelude
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB4_2
+; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: rdsvl x20, #1
+; CHECK-NEXT: sub x21, x29, #16
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
+; CHECK-NEXT: bl private_za_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB4_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB4_4:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: bl shared_za_callee
+; CHECK-NEXT: bl preserves_za_callee
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
+; CHECK-NEXT: bl private_za_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB4_6
+; CHECK-NEXT: // %bb.5:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB4_6:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: test_lazy_save_mixed_shared_and_private_callees:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB4_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: zero {za} +; CHECK-NEWLOWERING-NEXT: .LBB4_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB4_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB4_4: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: bl preserves_za_callee +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +{ + call void @private_za_callee() + call void @shared_za_callee() + call void @preserves_za_callee() + call void @private_za_callee() + ret void +} + +define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { +; CHECK-LABEL: test_many_back2back_private_za_calls: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: msub x8, x20, x20, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: bl shared_za_callee +; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_6: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_8: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_10 +; CHECK-NEXT: // %bb.9: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_10: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_12 +; CHECK-NEXT: // %bb.11: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_12: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_za_callee +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_many_back2back_private_za_calls: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -24 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -32 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB5_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + call void @shared_za_callee() + call void @private_za_callee() + call void @private_za_callee() + call void @private_za_callee() + call void @private_za_callee() + call void @private_za_callee() + call void @private_za_callee() + call void @shared_za_callee() + ret void +} + +define void @test_shared_private_shared() nounwind "aarch64_inout_za" { +; CHECK-LABEL: test_shared_private_shared: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: msub x8, x20, x20, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: bl shared_za_callee +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB6_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_za_callee +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_shared_private_shared: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB6_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + call void @shared_za_callee() call void @private_za_callee() + call void @shared_za_callee() ret void } + +define void @test_only_shared_za() nounwind "aarch64_inout_za" { +; CHECK-COMMON-LABEL: test_only_shared_za: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl shared_za_callee +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret + call void @shared_za_callee() + ret void +} + +declare i64 @shared_za_callee_i64(i64) "aarch64_inout_za" +declare i64 @private_za_callee_i64(i64) + +define i64 @test_shared_private_shared_i64(i64 %x) nounwind "aarch64_inout_za" { +; CHECK-LABEL: test_shared_private_shared_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: msub x8, x20, x20, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: bl shared_za_callee_i64 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl private_za_callee_i64 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_za_callee_i64 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_shared_private_shared_i64: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee_i64 +; CHECK-NEWLOWERING-NEXT: mov x1, x0 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB8_2: +; CHECK-NEWLOWERING-NEXT: mov x0, x1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + %a = call i64 @shared_za_callee_i64(i64 %x) + %b = call i64 @private_za_callee_i64(i64 %a) + %c = call i64 @shared_za_callee_i64(i64 %b) + ret i64 %c +} + +declare i64 @many_args_private_za_callee( + i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) + +; In this example some arguments are passed on the stack, which decrements the +; stack pointer before the call -- in this test the lazy save should be setup +; before the stack decrement. +define i64 @test_many_callee_arguments( +; CHECK-LABEL: test_many_callee_arguments: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: ldp x10, x11, [x29, #32] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w9, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: stp x10, x11, [sp, #-16]! +; CHECK-NEXT: bl many_args_private_za_callee +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB9_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT: mov x9, sp
+; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT: mov sp, x9
+; CHECK-NEWLOWERING-NEXT: ldp x10, x11, [x29, #32]
+; CHECK-NEWLOWERING-NEXT: sub x12, x29, #16
+; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x12
+; CHECK-NEWLOWERING-NEXT: stp x10, x11, [sp, #-16]!
+; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
+; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: smstart za
+; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB9_2
+; CHECK-NEWLOWERING-NEXT: // %bb.1:
+; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: .LBB9_2:
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
+  i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
+) nounwind "aarch64_inout_za" {
+  %ret = call i64 @many_args_private_za_callee(
+    i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9)
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir b/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir
new file mode 100644
index 0000000000000..18764d508d0fa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir
@@ -0,0 +1,132 @@
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme -run-pass=aarch64-machine-sme-abi -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme -aarch64-new-sme-abi %s -o - | FileCheck %s --check-prefix=CHECK-ASM
+
+# This tests the unfortunate case where the status flags ($nzcv) are live at
+# the point we want to restore ZA. Currently, this is handled by saving them
+# to a scratch register (see the sketch below).
+
+--- |
+  define void @cmp_branch(i32 %0) "aarch64_inout_za" {
+    tail call void @clobber()
+    %2 = icmp sgt i32 %0, 100
+    br i1 %2, label %3, label %4
+
+  3: ; preds = %1
+    tail call void @inout_call() #3
+    br label %4
+
+  4: ; preds = %3, %1
+    tail call void @inout_call() #3
+    ret void
+  }
+
+  declare void @clobber()
+  declare void @inout_call() "aarch64_inout_za"
+...
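+
+# A distilled sketch of the sequence the CHECK-ASM lines at the end of this
+# file verify (55824 and 56965 are the MIR encodings of the NZCV and
+# TPIDR2_EL0 system registers):
+#
+#   cmp   w20, #101            // sets NZCV; the conditional branch still needs it
+#   mrs   x8, NZCV             // stash the flags in a scratch register
+#   smstart za
+#   mrs   x9, TPIDR2_EL0
+#   sub   x0, x29, #16         // x0 = TPIDR2 block
+#   cbnz  x9, 1f
+#   bl    __arm_tpidr2_restore
+# 1:
+#   msr   TPIDR2_EL0, xzr
+#   msr   NZCV, x8             // reinstate the flags
+#   b.lt  ...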
+--- +name: cmp_branch +alignment: 4 +tracksRegLiveness: true +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +registers: + - { id: 0, class: gpr32common } + - { id: 1, class: gpr32 } +liveins: + - { reg: '$w0' } +frameInfo: + maxAlignment: 1 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 + hasTailCall: true +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: cmp_branch + ; CHECK: bb.0 (%ir-block.1): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp + ; CHECK-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]] + ; CHECK-NEXT: $sp = COPY [[MSUBXrrr]] + ; CHECK-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32common = COPY $w0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXri]] + ; CHECK-NEXT: MSR 56965, [[COPY2]] + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: RequiresZASavePseudo + ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY1]], 101, 0, implicit-def $nzcv + ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 55824, implicit-def $nzcv, implicit $nzcv + ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv + ; CHECK-NEXT: [[MRS1:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: RestoreZAPseudo [[MRS1]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + ; CHECK-NEXT: MSR 56965, $xzr + ; CHECK-NEXT: MSR 55824, [[MRS]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 11, %bb.2, implicit $nzcv + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.3): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: InOutZAUsePseudo + ; CHECK-NEXT: BL @inout_call, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.4): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: InOutZAUsePseudo + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: TCRETURNdi @inout_call, 0, csr_aarch64_aapcs, implicit $sp + + bb.0 (%ir-block.1): + successors: %bb.1, %bb.2 + liveins: $w0 + + %0:gpr32common = COPY $w0 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + RequiresZASavePseudo + BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %1:gpr32 = SUBSWri %0, 101, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1 (%ir-block.3): + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + InOutZAUsePseudo + BL @inout_call, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + + bb.2 (%ir-block.4): + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + InOutZAUsePseudo + ADJCALLSTACKUP 0, 0, implicit-def 
$sp, implicit $sp + TCRETURNdi @inout_call, 0, csr_aarch64_aapcs, implicit $sp +... + +# CHECK-ASM-LABEL: cmp_branch +# CHECK-ASM: msr TPIDR2_EL0, x10 +# CHECK-ASM-NEXT: bl clobber +# CHECK-ASM-NEXT: cmp w20, #101 +# CHECK-ASM-NEXT: mrs x8, NZCV +# CHECK-ASM-NEXT: smstart za +# CHECK-ASM-NEXT: mrs x9, TPIDR2_EL0 +# CHECK-ASM-NEXT: sub x0, x29, #16 +# CHECK-ASM-NEXT: cbnz x9, .LBB0_2 +# CHECK-ASM: bl __arm_tpidr2_restore +# CHECK-ASM-NEXT: .LBB0_2: +# CHECK-ASM-NEXT: msr TPIDR2_EL0, xzr +# CHECK-ASM-NEXT: msr NZCV, x8 +# CHECK-ASM-NEXT: b.lt .LBB0_4 +# CHECK-ASM: bl inout_call +# CHECK-ASM-NEXT: .LBB0_4: +# CHECK-ASM: b inout_call diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index 04d26902c536a..78f7e5c009288 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -1,53 +1,133 @@ -; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s -; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING declare void @shared_za_callee() "aarch64_inout_za" define void @private_za() "aarch64_new_za" { -; CHECK-LABEL: @private_za( -; CHECK-NEXT: prelude: -; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TPIDR2]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[SAVE_ZA:%.*]], label [[TMP0:%.*]] -; CHECK: save.za: -; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() -; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) -; CHECK-NEXT: br label [[TMP0]] -; CHECK: 0: -; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() -; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) -; CHECK-NEXT: call void @shared_za_callee() -; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() -; CHECK-NEXT: ret void +; CHECK-LABEL: private_za: +; CHECK: // %bb.0: // %prelude +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB0_2 +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: smstart za +; CHECK-NEXT: zero {za} +; CHECK-NEXT: bl shared_za_callee +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret ; +; CHECK-NEWLOWERING-LABEL: private_za: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -16 +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_1 +; CHECK-NEWLOWERING-NEXT: b .LBB0_2 +; CHECK-NEWLOWERING-NEXT: .LBB0_1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: zero {za} +; CHECK-NEWLOWERING-NEXT: b .LBB0_2 +; CHECK-NEWLOWERING-NEXT: .LBB0_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: bl shared_za_callee +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void @shared_za_callee() ret void } +; Note: This test must run at -O0 as otherwise the multiple exits are optimized out. +; TODO: We should be able to omit the ZA save here (as this function does not use ZA). define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" { -; CHECK-LABEL: @private_za_multiple_exit( -; CHECK-NEXT: prelude: -; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TPIDR2]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[SAVE_ZA:%.*]], label [[ENTRY:%.*]] -; CHECK: save.za: -; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() -; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) -; CHECK-NEXT: br label [[ENTRY]] -; CHECK: entry: -; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() -; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[COND:%.*]], 1 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] -; CHECK: if.else: -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() -; CHECK-NEXT: ret i32 [[ADD]] -; CHECK: if.end: -; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A]], [[B]] -; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() -; CHECK-NEXT: ret i32 [[SUB]] +; CHECK-LABEL: private_za_multiple_exit: +; CHECK: // %bb.0: // %prelude +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: str x2, [sp] // 8-byte Folded Spill +; CHECK-NEXT: str w1, [sp, #8] // 4-byte Folded Spill +; CHECK-NEXT: str w0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB1_2 +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; CHECK-NEXT: smstart za +; CHECK-NEXT: zero {za} +; CHECK-NEXT: subs x8, x8, #1 +; CHECK-NEXT: b.ne .LBB1_4 +; CHECK-NEXT: b .LBB1_3 +; CHECK-NEXT: .LBB1_3: // %if.else +; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_4: // %if.end +; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; CHECK-NEXT: subs w0, w8, w9 +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 
+; CHECK-NEXT: ret ; +; CHECK-NEWLOWERING-LABEL: private_za_multiple_exit: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_1 +; CHECK-NEWLOWERING-NEXT: b .LBB1_2 +; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %entry +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: zero {za} +; CHECK-NEWLOWERING-NEXT: b .LBB1_2 +; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %entry +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: str w1, [sp, #8] // 4-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str w0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: subs x8, x2, #1 +; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_4 +; CHECK-NEWLOWERING-NEXT: b .LBB1_3 +; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %if.else +; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add w0, w8, w9 +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: ret +; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %if.end +; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: subs w0, w8, w9 +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: ret entry: %tobool = icmp eq i64 %cond, 1 br i1 %tobool, label %if.else, label %if.end @@ -61,5 +141,38 @@ if.end: ret i32 %sub } -; CHECK: declare void @__arm_tpidr2_save() #[[ATTR:[0-9]+]] -; CHECK: attributes #[[ATTR]] = { "aarch64_pstate_sm_compatible" } +; In simple cases like this we should omit all ZA setup. 
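+; With -aarch64-new-sme-abi this compiles to just (per CHECK-NEWLOWERING below):
+;
+;   add w0, w0, w0
+;   ret
+;
+; while the old lowering still emits the full prelude, smstart/smstop, and
+; ZA zeroing.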
+define i32 @private_za_trivially_does_not_use_za(i32 %x) "aarch64_new_za" { +; CHECK-LABEL: private_za_trivially_does_not_use_za: +; CHECK: // %bb.0: // %prelude +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: str w0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB2_2 +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: b .LBB2_2 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: smstart za +; CHECK-NEXT: zero {za} +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: private_za_trivially_does_not_use_za: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: add w0, w0, w0 +; CHECK-NEWLOWERING-NEXT: ret + %ret = add i32 %x, %x + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 393ff3b79aedf..b4ff8d085ff40 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING declare void @private_za_callee() @@ -32,6 +33,31 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: disable_tailcallopt: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: bl private_za_callee +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB0_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret tail call void @private_za_callee() ret void } @@ -65,6 +91,31 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: f128_call_za: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: bl __addtf3 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB1_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = fadd fp128 %a, %b ret fp128 %res } diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll new file mode 100644 index 0000000000000..d3d7e953bedfa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -0,0 +1,1131 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING + +declare void @private_za_call() +declare void @shared_za_call() "aarch64_inout_za" + +define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { +; CHECK-LABEL: private_za_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: b.lt .LBB0_5 +; CHECK-NEXT: // %bb.1: // %loop.preheader +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_2: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b.eq .LBB0_5 +; CHECK-NEXT: .LBB0_3: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEXT: // %bb.4: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_5: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: private_za_loop: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: cmp w0, #1 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: b.lt .LBB0_3 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader +; CHECK-NEWLOWERING-NEXT: mov w19, w0 +; CHECK-NEWLOWERING-NEXT: .LBB0_2: // %loop +; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1 +; CHECK-NEWLOWERING-NEXT: b.ne .LBB0_2 +; CHECK-NEWLOWERING-NEXT: .LBB0_3: // %exit +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_5 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB0_5: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + %cmpgt = icmp sgt i32 %n, 0 + br i1 %cmpgt, label %loop, label %exit + +loop: + %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ] + tail call void @private_za_call() + %next_iv = add nuw nsw i32 %iv, 1 + %cmpeq = icmp eq i32 %next_iv, %n + br i1 %cmpeq, label %exit, label %loop + +exit: + ret void +} + +; FIXME: In the new lowering we could 
use edge weights to avoid placing the lazy save inside the loop.
+define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: mov w19, w0
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: bl shared_za_call
+; CHECK-NEXT: cmp w19, #1
+; CHECK-NEXT: b.lt .LBB1_5
+; CHECK-NEXT: // %bb.1: // %loop.preheader
+; CHECK-NEXT: rdsvl x20, #1
+; CHECK-NEXT: sub x21, x29, #16
+; CHECK-NEXT: b .LBB1_3
+; CHECK-NEXT: .LBB1_2: // %loop
+; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEXT: subs w19, w19, #1
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: b.eq .LBB1_5
+; CHECK-NEXT: .LBB1_3: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
+; CHECK-NEXT: bl private_za_call
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_2
+; CHECK-NEXT: // %bb.4: // %loop
+; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: b .LBB1_2
+; CHECK-NEXT: .LBB1_5: // %exit
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: b shared_za_call
+;
+; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-NEWLOWERING: // %bb.0: // %entry
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT: mov x9, sp
+; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT: mov sp, x9
+; CHECK-NEWLOWERING-NEXT: mov w19, w0
+; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT: bl shared_za_call
+; CHECK-NEWLOWERING-NEXT: cmp w19, #1
+; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
+; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
+; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
+; CHECK-NEWLOWERING-NEXT: b .LBB1_3
+; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
+; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
+; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEWLOWERING-NEXT: bl private_za_call
+; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
+; CHECK-NEWLOWERING-NEXT: smstart za
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
+; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: b .LBB1_2
+; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: b shared_za_call
+entry:
+ %cmpgt = icmp sgt i32 %n, 0
+ tail call void @shared_za_call()
+ br i1 %cmpgt, label %loop, label %exit
+
+loop:
+ %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+ tail call void @private_za_call()
+ %next_iv = add nuw nsw i32 %iv, 1
+ %cmpeq = icmp eq i32 %next_iv, %n
+ br i1 %cmpeq, label %exit, label %loop
+
+exit:
+ tail call void @shared_za_call()
+ ret void
+}
+
+define void @shared_za_loop(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-COMMON-LABEL: shared_za_loop:
+; CHECK-COMMON: // %bb.0: // %entry
+; CHECK-COMMON-NEXT: cmp w0, #1
+; CHECK-COMMON-NEXT: b.lt .LBB2_4
+; CHECK-COMMON-NEXT: // %bb.1: // %loop.preheader
+; CHECK-COMMON-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov w19, w0
+; CHECK-COMMON-NEXT: .LBB2_2: // %loop
+; CHECK-COMMON-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT: bl shared_za_call
+; CHECK-COMMON-NEXT: subs w19, w19, #1
+; CHECK-COMMON-NEXT: b.ne .LBB2_2
+; CHECK-COMMON-NEXT: // %bb.3:
+; CHECK-COMMON-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: .LBB2_4: // %exit
+; CHECK-COMMON-NEXT: ret
+entry:
+ %cmpgt = icmp sgt i32 %n, 0
+ br i1 %cmpgt, label %loop, label %exit
+
+loop:
+ %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+ tail call void @shared_za_call()
+ %next_iv = add nuw nsw i32 %iv, 1
+ %cmpeq = icmp eq i32 %next_iv, %n
+ br i1 %cmpeq, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; FIXME: The codegen for this case could be improved (by tuning weights).
+; Here the ZA save has been hoisted out of the conditional, but it would be
+; better to sink it, as sketched below.
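+;
+; A rough sketch of the preferred codegen (illustrative only: nothing below is
+; checked by this test, and the .Lexit label is invented). Sinking the save
+; would arm the lazy save only on the path that actually clobbers ZA:
+;
+;   tbz w0, #0, .Lexit   // %cond is false: no lazy save needed
+;   sub x8, x29, #16     // address of the TPIDR2 block
+;   msr TPIDR2_EL0, x8   // arm the lazy save
+;   bl private_za_call
+;   smstart za           // re-enable ZA, then check TPIDR2_EL0 and call
+;   ...                  // __arm_tpidr2_restore if the save was committed
+; .Lexit: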
+define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: cond_private_za_call: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB3_4 +; CHECK-NEXT: // %bb.1: // %private_za_call +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB3_3 +; CHECK-NEXT: // %bb.2: // %private_za_call +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_3: // %private_za_call +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB3_4: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: b shared_za_call +; +; CHECK-NEWLOWERING-LABEL: cond_private_za_call: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB3_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %private_za_call +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: .LBB3_2: // %exit +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB3_4: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: b shared_za_call + br i1 %cond, label %private_za_call, label %exit + +private_za_call: + tail call void @private_za_call() + br label %exit + +exit: + tail call void @shared_za_call() + ret void +} + +define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: mixed_shared_private_za_loop: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: msub x8, x20, x20, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: b .LBB4_2 +; CHECK-NEXT: .LBB4_1: // %loop +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldrb w8, [x19] +; CHECK-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEXT: .LBB4_2: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB4_1 +; CHECK-NEXT: // %bb.3: // %loop +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_4: // %exit +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: mixed_shared_private_za_loop: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: mov x19, x0 +; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: b .LBB4_2 +; CHECK-NEWLOWERING-NEXT: .LBB4_1: // %loop +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEWLOWERING-NEXT: .LBB4_2: // %loop +; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: ldrb w8, [x19] +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB4_1 +; CHECK-NEWLOWERING-NEXT: // %bb.3: // %loop +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: b .LBB4_1 +; CHECK-NEWLOWERING-NEXT: .LBB4_4: // %exit +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + br label %loop + +loop: + call void @shared_za_call() + call void @private_za_call() + br label %latch + +latch: + %bool = load volatile i8, ptr %cond, align 1 + %trunc = trunc i8 %bool to i1 + br i1 %trunc, label %loop, label %exit + 
+exit: + call void @shared_za_call() + ret void +} + + +define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: cond_clobber_followed_by_clobber: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: tbz w19, #0, .LBB5_4 +; CHECK-NEXT: // %bb.1: // %cond_clobber +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_3 +; CHECK-NEXT: // %bb.2: // %cond_clobber +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_3: // %cond_clobber +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB5_4: // %exit +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_6 +; CHECK-NEXT: // %bb.5: // %exit +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_6: // %exit +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b shared_za_call +; +; CHECK-NEWLOWERING-LABEL: cond_clobber_followed_by_clobber: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: mov w19, w0 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB5_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %cond_clobber +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: .LBB5_2: // %exit +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB5_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB5_4: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: b shared_za_call + tail call void @shared_za_call() + br i1 %cond, label %cond_clobber, label %exit + +cond_clobber: + tail call void @private_za_call() + br label %exit + +exit: + tail call void @private_za_call() + tail call void @shared_za_call() + ret void +} + +define void @conditionally_use_za(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: conditionally_use_za: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB6_4 +; CHECK-NEXT: // %bb.1: // %use_za +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB6_3 +; CHECK-NEXT: // %bb.2: // %use_za +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_3: // %use_za +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB6_4: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: conditionally_use_za: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB6_4 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_za +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_3 +; CHECK-NEWLOWERING-NEXT: // %bb.2: // %use_za +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB6_3: // %use_za +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: .LBB6_4: // %exit +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + br i1 %cond, label %use_za, label %exit + +use_za: + tail call void @shared_za_call() + tail call void @private_za_call() + br label %exit + +exit: + ret void +} + + +define void @diamond_mixed_za_merge_shared(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: diamond_mixed_za_merge_shared: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB7_2 +; CHECK-NEXT: // %bb.1: // %then +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: b .LBB7_5 +; CHECK-NEXT: .LBB7_2: // %else +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB7_4 +; CHECK-NEXT: // %bb.3: // %else +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_4: // %else +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB7_5: // %merge_shared +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_shared: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB7_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: b .LBB7_5 +; CHECK-NEWLOWERING-NEXT: .LBB7_2: // %else +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB7_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: // %else +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB7_4: // %else +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: .LBB7_5: // %merge_shared +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + br i1 %cond, label %then, label %else + +then: + call void @shared_za_call() + br label %merge_shared + +else: + call void @private_za_call() + br label %merge_shared + +merge_shared: + call void @shared_za_call() + ret void +} + + +define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: diamond_mixed_za_merge_private: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %then +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: b .LBB8_5 +; CHECK-NEXT: .LBB8_2: // %else +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB8_4 +; CHECK-NEXT: // %bb.3: // %else +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_4: // %else +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB8_5: // %merge_private_za +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB8_7 +; CHECK-NEXT: // %bb.6: // %merge_private_za +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_7: // %merge_private_za +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_private: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB8_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: b .LBB8_3 +; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %else +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: .LBB8_3: // %merge_private_za +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_5 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %merge_private_za +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB8_5: // %merge_private_za +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + br i1 %cond, label %then, label %else + +then: + call void @shared_za_call() + br label %merge_private_za + +else: + call void @private_za_call() + br label %merge_private_za + +merge_private_za: + call void @private_za_call() + ret void +} + +define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { +; CHECK-LABEL: critical_edge_mixed_za: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB9_5 +; CHECK-NEXT: // %bb.1: // %shared_path +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: tbz w19, #0, .LBB9_8 +; CHECK-NEXT: .LBB9_2: // %exit_private +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB9_4 +; CHECK-NEXT: // %bb.3: // %exit_private +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_4: // %exit_private +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB9_9 +; CHECK-NEXT: .LBB9_5: // %private_path +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB9_7 +; CHECK-NEXT: // %bb.6: // %private_path +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_7: // %private_path +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: tbnz w19, #0, .LBB9_2 +; CHECK-NEXT: .LBB9_8: // %exit_shared +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: .LBB9_9: // %common.ret +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: critical_edge_mixed_za: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: mov w19, w1 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB9_5 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %shared_path +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB9_8 +; CHECK-NEWLOWERING-NEXT: .LBB9_2: // %exit_private +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit_private +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB9_4: // %exit_private +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: b .LBB9_9 +; CHECK-NEWLOWERING-NEXT: .LBB9_5: // %private_path +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB9_7 +; CHECK-NEWLOWERING-NEXT: // %bb.6: // %private_path +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB9_7: // %private_path +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB9_2 +; CHECK-NEWLOWERING-NEXT: .LBB9_8: // %exit_shared +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: .LBB9_9: // %common.ret +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + br i1 %c1, label %shared_path, label %private_path + +shared_path: + call void @shared_za_call() + br label %merge + +private_path: + call void @private_za_call() + br label %merge + +merge: + br i1 %c2, label %exit_private, label %exit_shared + +exit_private: + call void @private_za_call() + ret void + +exit_shared: + call void @shared_za_call() + ret void +} + +define void @nested_cond_in_loop(i32 %n, i1 %cond) "aarch64_inout_za" nounwind { +; CHECK-LABEL: nested_cond_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x23, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: b.lt .LBB10_8 +; CHECK-NEXT: // %bb.1: // %loop.preheader +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: mov w20, w0 +; CHECK-NEXT: mov w21, wzr +; CHECK-NEXT: rdsvl x22, #1 +; CHECK-NEXT: sub x23, x29, #16 +; CHECK-NEXT: b .LBB10_4 +; CHECK-NEXT: .LBB10_2: // %use_shared +; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: .LBB10_3: // %latch +; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEXT: add w21, w21, #1 +; CHECK-NEXT: cmp w21, w20 +; CHECK-NEXT: b.ge .LBB10_8 +; CHECK-NEXT: .LBB10_4: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: tbnz w19, #0, .LBB10_2 +; CHECK-NEXT: // %bb.5: // %use_private +; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEXT: sturh w22, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x23 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB10_7 +; CHECK-NEXT: // %bb.6: // %use_private +; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB10_7: // %use_private +; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB10_3 +; CHECK-NEXT: .LBB10_8: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x23, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: nested_cond_in_loop: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: cmp w0, #1 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: b.lt .LBB10_8 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader +; CHECK-NEWLOWERING-NEXT: mov w19, w1 +; CHECK-NEWLOWERING-NEXT: mov w20, w0 +; CHECK-NEWLOWERING-NEXT: mov w21, wzr +; CHECK-NEWLOWERING-NEXT: sub x22, x29, #16 +; CHECK-NEWLOWERING-NEXT: b .LBB10_4 +; CHECK-NEWLOWERING-NEXT: .LBB10_2: // %use_shared +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: .LBB10_3: // %latch +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: add w21, w21, #1 +; CHECK-NEWLOWERING-NEXT: cmp w21, w20 +; CHECK-NEWLOWERING-NEXT: b.ge .LBB10_8 +; CHECK-NEWLOWERING-NEXT: .LBB10_4: // %loop +; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB10_2 +; CHECK-NEWLOWERING-NEXT: // %bb.5: // %use_private +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x22 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB10_7 +; CHECK-NEWLOWERING-NEXT: // %bb.6: // %use_private +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB10_7: // %use_private +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: b .LBB10_3 +; CHECK-NEWLOWERING-NEXT: .LBB10_8: // %exit +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %latch ] + br i1 %cond, label %use_shared, label %use_private + +use_shared: + call void @shared_za_call() + br label %latch + +use_private: + call void @private_za_call() + br label %latch + +latch: + %inc = add i32 %iv, 1 + %cmp2 = icmp slt i32 %inc, %n + br i1 %cmp2, label %loop, label %exit + +exit: + ret void +} + +define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { +; CHECK-LABEL: loop_with_external_entry: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: tbz w0, #0, .LBB11_2 +; CHECK-NEXT: // %bb.1: // %init +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: .LBB11_2: // %loop.preheader +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: b .LBB11_4 +; CHECK-NEXT: .LBB11_3: // %loop +; CHECK-NEXT: // in Loop: Header=BB11_4 Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: tbz w19, #0, .LBB11_6 +; CHECK-NEXT: .LBB11_4: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB11_3 +; CHECK-NEXT: // %bb.5: // %loop +; CHECK-NEXT: // in Loop: Header=BB11_4 Depth=1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB11_3 +; CHECK-NEXT: .LBB11_6: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: loop_with_external_entry: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: mov w19, w1 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB11_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init +; CHECK-NEWLOWERING-NEXT: bl shared_za_call +; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader +; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop +; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: bl private_za_call +; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB11_3 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_6 +; CHECK-NEWLOWERING-NEXT: // %bb.5: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret +entry: + br i1 %c1, label %init, label %loop + +init: + call void @shared_za_call() + br label %loop + +loop: + call void @private_za_call() + br i1 %c2, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll 
b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll new file mode 100644 index 0000000000000..c497a95a58c8a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -0,0 +1,288 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s + +; A simple EH test case that corresponds to the following C++ source: +; +; struct ZAResource { +; ~ZAResource() __arm_inout("za") { +; shared_za_call(); // simulate cleanup in destructor +; } +; }; +; +; void za_with_raii(bool fail) __arm_inout("za") { +; ZAResource r; +; if (fail) +; throw "Unwinding needs ZA state reload"; +; } +; +; Here if an exception is thrown we must call the ~ZAResource destructor while +; unwinding the stack. That requires us to restore ZA state before the +; shared_za_call in the cleanup block. + +@.str = private unnamed_addr constant [32 x i8] c"Unwinding needs ZA state reload\00", align 1 +@typeinfo_for_char_const_ptr = external constant ptr + +define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: za_with_raii: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception0 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: tbnz w0, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b shared_za_call +; CHECK-NEXT: .LBB0_2: // %throw_exception +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: mov w0, #8 // =0x8 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: adrp x8, .L.str +; CHECK-NEXT: add x8, x8, :lo12:.L.str +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x9, .LBB0_4 +; CHECK-NEXT: // %bb.3: // %throw_exception +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_4: // %throw_exception +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: // kill: def $x0 killed $x8 +; CHECK-NEXT: // %bb.5: // %throw_fail +; CHECK-NEXT: .LBB0_6: // %unwind_dtors +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_8 +; CHECK-NEXT: // %bb.7: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_8: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl 
_Unwind_Resume
+ br i1 %fail, label %throw_exception, label %return_normally
+
+throw_exception:
+ %exception_ptr = tail call ptr @__cxa_allocate_exception(i64 8) #3
+ store ptr @.str, ptr %exception_ptr, align 16
+ invoke void @__cxa_throw(ptr nonnull %exception_ptr, ptr nonnull @typeinfo_for_char_const_ptr, ptr null)
+ to label %throw_fail unwind label %unwind_dtors
+
+unwind_dtors:
+ %5 = landingpad { ptr, i32 }
+ cleanup
+ tail call void @shared_za_call()
+ resume { ptr, i32 } %5
+
+return_normally:
+ tail call void @shared_za_call()
+ ret void
+
+throw_fail:
+ unreachable
+}
+
+
+; Another simple exception handling example. Here we need to restore ZA in two
+; places: after the may_throw() call, to handle the case where it does not
+; throw, and within the catch block, before the shared_za_call(). We also need
+; to set up the lazy save around the C++ exception ABI routines (to handle the
+; _very_ unlikely case that they use ZA state).
+;
+; void za_try_catch() __arm_inout("za") {
+; try {
+; may_throw();
+; } catch (...) {
+; shared_za_call();
+; }
+; shared_za_call();
+; }
+define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch:
+; CHECK: .Lfunc_begin1:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception1
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl may_throw
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: .LBB1_1: // %after_catch
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_3
+; CHECK-NEXT: // %bb.2: // %after_catch
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_3: // %after_catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: b shared_za_call
+; CHECK-NEXT: .LBB1_4: // %catch
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: bl __cxa_begin_catch
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_6
+; CHECK-NEXT: // %bb.5: // %catch
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_6: // %catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: bl shared_za_call
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: b .LBB1_1
+ invoke void @may_throw()
+ to label %after_catch unwind label %catch
+
+catch: ; preds = %0
+ %eh_info = landingpad { ptr, i32 }
+ catch ptr null
+ %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+ tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+ tail call void @shared_za_call()
+ tail call void @__cxa_end_catch()
+ br label %after_catch
+
+after_catch:
+ tail call void @shared_za_call()
+ ret void
+}
+
+; This example corresponds to:
+;
+; __arm_new("za") void try_catch_shared_za_callee()
+; {
+; try {
+; shared_za_call();
+; } catch(...)
{
+; noexcept_shared_za_call();
+; }
+; }
+;
+; In this example we don't set up a lazy save before shared_za_call(); however,
+; we still enter the catch block in a "ZA off" state. This leads to us emitting
+; a restore from an uninitialized save buffer in the catch block. This is not
+; ideal, but it is valid under the SME ABI. Ideally, we would omit the save
+; buffer and the restore, and simply set ZA to "on" in the catch block.
+
+define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_shared_za_callee:
+; CHECK: .Lfunc_begin2:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception2
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB2_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: .Ltmp6:
+; CHECK-NEXT: bl shared_za_call
+; CHECK-NEXT: .Ltmp7:
+; CHECK-NEXT: .LBB2_3: // %exit
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_4: // %catch
+; CHECK-NEXT: .Ltmp8:
+; CHECK-NEXT: bl __cxa_begin_catch
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB2_6
+; CHECK-NEXT: // %bb.5: // %catch
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB2_6: // %catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: bl noexcept_shared_za_call
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: b .LBB2_3
+ invoke void @shared_za_call() #4
+ to label %exit unwind label %catch
+catch:
+ %eh_info = landingpad { ptr, i32 }
+ catch ptr null
+ %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+ tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+ tail call void @noexcept_shared_za_call()
+ tail call void @__cxa_end_catch()
+ br label %exit
+
+exit:
+ ret void
+}
+
+declare ptr @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(ptr, ptr, ptr)
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+declare i32 @__gxx_personality_v0(...)
+ +declare void @may_throw() +declare void @shared_za_call() "aarch64_inout_za" +declare void @noexcept_shared_za_call() "aarch64_inout_za" diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index ad3f7f5514d0e..a9ad6f695cf8f 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -1,11 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING define i32 @no_tpidr2_save_required() "aarch64_inout_za" { -; CHECK-LABEL: no_tpidr2_save_required: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w0, #42 // =0x2a -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: no_tpidr2_save_required: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: mov w0, #42 // =0x2a +; CHECK-COMMON-NEXT: ret entry: ret i32 42 } @@ -50,6 +51,42 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB1_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b +; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 +; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1 +; CHECK-NEWLOWERING-NEXT: b .LBB1_3 +; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %use_c +; CHECK-NEWLOWERING-NEXT: fmov s0, s1 +; CHECK-NEWLOWERING-NEXT: bl cosf +; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %exit +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_5 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c @@ -66,6 +103,7 @@ exit: ret float %ret } +; FIXME: This is missing stack probes with -aarch64-new-sme-abi. 
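+; (Note: the lazy-save buffer is a dynamically-sized allocation of SVL x SVL
+; bytes, so with "probe-stack"="inline-asm" it should be probed as it is
+; allocated, rather than dropping sp in a single step (msub + mov sp) as in
+; the CHECK-NEWLOWERING output below.)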
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: ; CHECK: // %bb.0: @@ -115,6 +153,42 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required_stackprobe: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: str xzr, [sp, #-16]! +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b +; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 +; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1 +; CHECK-NEWLOWERING-NEXT: b .LBB2_3 +; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c +; CHECK-NEWLOWERING-NEXT: fmov s0, s1 +; CHECK-NEWLOWERING-NEXT: bl cosf +; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 63577e4d217a8..57c1ced8ab125 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING ; ; Private-ZA Callee @@ -8,19 +9,19 @@ ; Expect spill & fill of ZT0 around call ; Expect smstop/smstart za around call define void @zt0_in_caller_no_state_callee(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-LABEL: zt0_in_caller_no_state_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za -; CHECK-NEXT: blr x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: ldp 
x30, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: zt0_in_caller_no_state_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret call void %callee(); ret void; } @@ -61,6 +62,36 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: za_zt0_shared_caller_no_state_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x19, x29, #64 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB1_2: +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void %callee(); ret void; } @@ -71,41 +102,41 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za ; Caller and callee have shared ZT0 state, no spill/fill of ZT0 required define void @zt0_shared_caller_zt0_shared_callee(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-LABEL: zt0_shared_caller_zt0_shared_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: blr x0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: zt0_shared_caller_zt0_shared_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void %callee() "aarch64_in_zt0"; ret void; } ; Expect spill & fill of ZT0 around call define void @za_zt0_shared_caller_za_shared_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { -; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: blr x0 -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: za_zt0_shared_caller_za_shared_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret call void %callee() "aarch64_inout_za"; ret void; } ; Caller and callee have shared ZA & ZT0 define void @za_zt0_shared_caller_za_zt0_shared_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { -; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: blr x0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } @@ -115,19 +146,19 @@ define void @za_zt0_shared_caller_za_zt0_shared_callee(ptr %callee) "aarch64_ino ; Expect spill & fill of ZT0 around call ; Expect smstop/smstart za around call define void @zt0_in_caller_zt0_new_callee(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-LABEL: zt0_in_caller_zt0_new_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za -; CHECK-NEXT: blr x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: zt0_in_caller_zt0_new_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret call void %callee() "aarch64_new_zt0"; ret void; } @@ -161,6 +192,29 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 +; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: .LBB6_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: zero { zt0 } +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 +; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; ret void; } @@ -191,6 +245,27 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 +; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: .LBB7_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: zero { zt0 } +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: str zt0, 
[x19] +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 +; CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() %res.0 = extractvalue {i64, i64} %res, 0 ret i64 %res.0 @@ -221,6 +296,22 @@ define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind { ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_new_caller: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB8_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: .LBB8_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: zero { zt0 } +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_in_zt0"; ret void; } @@ -245,6 +336,23 @@ define void @new_za_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_new_zt0" n ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: new_za_zt0_caller: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB9_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: zero {za} +; CHECK-NEWLOWERING-NEXT: .LBB9_2: +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: zero { zt0 } +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } @@ -258,19 +366,26 @@ define void @new_za_shared_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_in_ ; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: new_za_shared_zt0_caller: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } ; Expect clear ZT0 on entry define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" nounwind { -; CHECK-LABEL: shared_za_new_zt0: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: blr x0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: shared_za_new_zt0: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-COMMON-NEXT: zero { zt0 } +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index d440535f022c4..e8f16e64ce4b0 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-streaming-hazard-size=0 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT ; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64 @@ -10,28 +11,29 @@ ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8 define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: csr_d8_allocnxv4i32i32f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 32 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -8 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: str d0, [sp] -; CHECK-NEXT: str z1, [x8] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: csr_d8_allocnxv4i32i32f64: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-COMMON-NEXT: str x29, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 32 + 8 * VG +; CHECK-COMMON-NEXT: .cfi_offset w29, -8 +; CHECK-COMMON-NEXT: .cfi_offset b8, -16 +; CHECK-COMMON-NEXT: mov z1.s, #0 // =0x0 +; CHECK-COMMON-NEXT: add x8, sp, #16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [sp, #12] +; CHECK-COMMON-NEXT: str d0, [sp] +; CHECK-COMMON-NEXT: str z1, [x8] +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: add sp, sp, #16 +; CHECK-COMMON-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32 @@ -52,31 +54,31 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8 define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { -; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_fp: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: add x29, sp, #16 -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -32 -; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: addvl x8, sp, #1 -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [x8, #28] -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: str d0, [sp, #8] -; CHECK-NEXT: str z1, [x8, #-1, mul vl] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: csr_d8_allocnxv4i32i32f64_fp: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: add x29, sp, #16 +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: .cfi_offset b8, -32 +; CHECK-COMMON-NEXT: mov z1.s, #0 // =0x0 +; CHECK-COMMON-NEXT: addvl x8, sp, #1 +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [x8, #28] +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: str d0, [sp, #8] +; CHECK-COMMON-NEXT: str z1, [x8, #-1, mul vl] +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: add sp, sp, #16 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32 @@ -102,30 +104,30 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128-16 x vscale], Type: Variable, Align: 128, Size: 4 define i32 @csr_d8_allocnxv4i32i32f64_dynamicrealign(double %d) "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_dynamicrealign: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str d8, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-NEXT: sub x9, sp, #96 -; CHECK-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: add x29, sp, #16 -; CHECK-NEXT: addvl x9, x9, #-1 -; CHECK-NEXT: and sp, x9, #0xffffffffffffff80 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -32 -; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [sp] -; CHECK-NEXT: stur d0, [x29, #-8] -; CHECK-NEXT: str z1, [x8, #-1, mul vl] -; CHECK-NEXT: sub sp, x29, #16 -; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: csr_d8_allocnxv4i32i32f64_dynamicrealign: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub x9, sp, #96 +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: add x29, sp, #16 +; CHECK-COMMON-NEXT: addvl x9, x9, #-1 +; CHECK-COMMON-NEXT: and sp, x9, #0xffffffffffffff80 +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: .cfi_offset b8, -32 +; CHECK-COMMON-NEXT: mov z1.s, #0 // =0x0 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [sp] +; CHECK-COMMON-NEXT: stur d0, [x29, #-8] +; CHECK-COMMON-NEXT: str z1, [x8, #-1, mul vl] +; CHECK-COMMON-NEXT: sub sp, x29, #16 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32, align 128 @@ -151,44 +153,44 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str d8, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill -; CHECK-NEXT: add x29, sp, #8 -; CHECK-NEXT: str x19, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: mov x19, sp -; CHECK-NEXT: .cfi_def_cfa w29, 24 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: .cfi_offset w29, -24 -; CHECK-NEXT: .cfi_offset b8, -32 -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: ubfiz x8, x0, #2, #32 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: add x8, x8, #15 -; CHECK-NEXT: and x8, x8, #0x7fffffff0 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: sub x8, x10, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [x8] -; CHECK-NEXT: sub x8, x29, #8 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: str wzr, [x9] -; CHECK-NEXT: str d0, [x19, #8] -; CHECK-NEXT: str z1, [x8, #-1, mul vl] -; CHECK-NEXT: sub sp, x29, #8 -; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: csr_d8_allocnxv4i32i32f64_vla: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: add x29, sp, #8 +; CHECK-COMMON-NEXT: str x19, [sp, #24] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 24 +; CHECK-COMMON-NEXT: .cfi_offset w19, -8 +; CHECK-COMMON-NEXT: .cfi_offset w30, -16 +; CHECK-COMMON-NEXT: .cfi_offset w29, -24 +; CHECK-COMMON-NEXT: .cfi_offset b8, -32 +; CHECK-COMMON-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-COMMON-NEXT: ubfiz x8, x0, #2, #32 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: add x8, x8, #15 +; CHECK-COMMON-NEXT: and x8, x8, #0x7fffffff0 +; CHECK-COMMON-NEXT: sub x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: mov x10, sp +; CHECK-COMMON-NEXT: sub x8, x10, x8 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: mov z1.s, #0 // =0x0 +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [x8] +; CHECK-COMMON-NEXT: sub x8, x29, #8 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: str wzr, [x9] +; CHECK-COMMON-NEXT: str d0, [x19, #8] +; CHECK-COMMON-NEXT: str z1, [x8, #-1, mul vl] +; CHECK-COMMON-NEXT: sub sp, x29, #8 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %0 = zext i32 %i to i64 @@ -213,28 +215,28 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8 define i32 @csr_d8_allocnxv4i32i32f64_stackargsi32f64(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_stackargsi32f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 32 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -8 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: str d0, [sp] -; CHECK-NEXT: str z1, [x8] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: csr_d8_allocnxv4i32i32f64_stackargsi32f64: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str x29, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 32 + 8 * VG +; CHECK-COMMON-NEXT: .cfi_offset w29, -8 +; CHECK-COMMON-NEXT: .cfi_offset b8, -16 +; CHECK-COMMON-NEXT: mov z1.s, #0 // =0x0 +; CHECK-COMMON-NEXT: add x8, sp, #16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [sp, #12] +; CHECK-COMMON-NEXT: str d0, [sp] +; CHECK-COMMON-NEXT: str z1, [x8] +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: add sp, sp, #16 +; CHECK-COMMON-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32 @@ -255,29 +257,29 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8 define i32 @svecc_z8_allocnxv4i32i32f64_fp(double %d, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { -; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_fp: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: str z1, [x29, #-2, mul vl] -; CHECK-NEXT: str d0, [sp], #16 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: svecc_z8_allocnxv4i32i32f64_fp: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [sp, #12] +; CHECK-COMMON-NEXT: str z1, [x29, #-2, mul vl] +; CHECK-COMMON-NEXT: str d0, [sp], #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32 @@ -299,29 +301,29 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8 define i32 @svecc_z8_allocnxv4i32i32f64_stackargsi32_fp(double %d, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all"{ -; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: str wzr, [sp, #12] -; CHECK-NEXT: str z1, [x29, #-2, mul vl] -; CHECK-NEXT: str d0, [sp], #16 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #-1 +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 +; CHECK-COMMON-NEXT: mov w0, wzr +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: str wzr, [sp, #12] +; CHECK-COMMON-NEXT: str z1, [x29, #-2, mul vl] +; CHECK-COMMON-NEXT: str d0, [sp], #16 +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: addvl sp, sp, #1 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %a = alloca <vscale x 4 x i32> %b = alloca i32 @@ -370,125 +372,125 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-280 x vscale], Type: Spill, Align: 2, Size: vscale x 2 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: svecc_call: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w27, -16 -; CHECK-NEXT: .cfi_offset w28, -24 -; CHECK-NEXT: .cfi_offset w30, -40 -; CHECK-NEXT: .cfi_offset w29, -48 -; CHECK-NEXT: addvl sp, sp, #-18 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG -; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; 
CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48 -; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48 -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: //APP -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: .cfi_offset vg, -32 -; CHECK-NEXT: tbz w19, #0, .LBB7_2 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: smstop sm -; CHECK-NEXT: .LBB7_2: // %entry -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: mov w1, #45 // =0x2d -; CHECK-NEXT: mov w2, #37 // =0x25 -; CHECK-NEXT: bl memset -; CHECK-NEXT: tbz w19, #0, .LBB7_4 -; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB7_4: // %entry -; CHECK-NEXT: mov w0, #22647 // =0x5877 -; CHECK-NEXT: movk w0, #59491, lsl #16 -; CHECK-NEXT: .cfi_restore vg -; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte 
Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #18 -; CHECK-NEXT: .cfi_def_cfa wsp, 48 -; CHECK-NEXT: .cfi_restore z8 -; CHECK-NEXT: .cfi_restore z9 -; CHECK-NEXT: .cfi_restore z10 -; CHECK-NEXT: .cfi_restore z11 -; CHECK-NEXT: .cfi_restore z12 -; CHECK-NEXT: .cfi_restore z13 -; CHECK-NEXT: .cfi_restore z14 -; CHECK-NEXT: .cfi_restore z15 -; CHECK-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w19 -; CHECK-NEXT: .cfi_restore w27 -; CHECK-NEXT: .cfi_restore w28 -; CHECK-NEXT: .cfi_restore w30 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: svecc_call: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: .cfi_def_cfa_offset 48 +; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: .cfi_offset w19, -8 +; CHECK-COMMON-NEXT: .cfi_offset w27, -16 +; CHECK-COMMON-NEXT: .cfi_offset w28, -24 +; CHECK-COMMON-NEXT: .cfi_offset w30, -40 +; CHECK-COMMON-NEXT: .cfi_offset w29, -48 +; CHECK-COMMON-NEXT: addvl sp, sp, #-18 +; CHECK-COMMON-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK-COMMON-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-COMMON-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; 
CHECK-COMMON-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48 +; CHECK-COMMON-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48 +; CHECK-COMMON-NEXT: mov x8, x0 +; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: mov x19, x0 +; CHECK-COMMON-NEXT: //APP +; CHECK-COMMON-NEXT: //NO_APP +; CHECK-COMMON-NEXT: .cfi_offset vg, -32 +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB7_2 +; CHECK-COMMON-NEXT: // %bb.1: // %entry +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: .LBB7_2: // %entry +; CHECK-COMMON-NEXT: mov x0, x8 +; CHECK-COMMON-NEXT: mov w1, #45 // =0x2d +; CHECK-COMMON-NEXT: mov w2, #37 // =0x25 +; CHECK-COMMON-NEXT: bl memset +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB7_4 +; CHECK-COMMON-NEXT: // %bb.3: // %entry +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: .LBB7_4: // %entry +; CHECK-COMMON-NEXT: mov w0, #22647 // =0x5877 +; CHECK-COMMON-NEXT: movk w0, #59491, lsl #16 +; CHECK-COMMON-NEXT: .cfi_restore vg +; CHECK-COMMON-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: 
ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-COMMON-NEXT: addvl sp, sp, #18 +; CHECK-COMMON-NEXT: .cfi_def_cfa wsp, 48 +; CHECK-COMMON-NEXT: .cfi_restore z8 +; CHECK-COMMON-NEXT: .cfi_restore z9 +; CHECK-COMMON-NEXT: .cfi_restore z10 +; CHECK-COMMON-NEXT: .cfi_restore z11 +; CHECK-COMMON-NEXT: .cfi_restore z12 +; CHECK-COMMON-NEXT: .cfi_restore z13 +; CHECK-COMMON-NEXT: .cfi_restore z14 +; CHECK-COMMON-NEXT: .cfi_restore z15 +; CHECK-COMMON-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: .cfi_def_cfa_offset 0 +; CHECK-COMMON-NEXT: .cfi_restore w19 +; CHECK-COMMON-NEXT: .cfi_restore w27 +; CHECK-COMMON-NEXT: .cfi_restore w28 +; CHECK-COMMON-NEXT: .cfi_restore w30 +; CHECK-COMMON-NEXT: .cfi_restore w29 +; CHECK-COMMON-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) @@ -590,6 +592,79 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: .cfi_restore b14 ; CHECK-NEXT: .cfi_restore b15 ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: vastate: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEWLOWERING-NEXT: cntd x9 +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w19, -8 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w20, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -40 +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -48 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b8, -56 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b9, -64 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b10, -72 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b11, -80 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b12, -88 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b13, -96 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b14, -104 +; CHECK-NEWLOWERING-NEXT: .cfi_offset b15, -112 +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 +; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #80 +; CHECK-NEWLOWERING-NEXT: mov w20, w0 +; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEWLOWERING-NEXT: .cfi_offset vg, -32 +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: bl other +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: mov w0, w20 +; CHECK-NEWLOWERING-NEXT: mov w8, w0 +; CHECK-NEWLOWERING-NEXT: .cfi_restore vg +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEWLOWERING-NEXT: sub x0, x29, #80 +; CHECK-NEWLOWERING-NEXT: cbnz x9, .LBB8_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: // %entry +; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: .LBB8_2: // %entry +; CHECK-NEWLOWERING-NEXT: mov w0, w8 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEWLOWERING-NEXT: .cfi_restore w19 +; CHECK-NEWLOWERING-NEXT: .cfi_restore w20 +; CHECK-NEWLOWERING-NEXT: .cfi_restore w30 +; CHECK-NEWLOWERING-NEXT: .cfi_restore w29 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b8 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b9 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b10 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b11 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b12 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b13 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b14 +; CHECK-NEWLOWERING-NEXT: .cfi_restore b15 +; CHECK-NEWLOWERING-NEXT: ret entry: tail call void @other() ret i32 %x diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp 
b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index e90f733d79fca..bd0e53c69622e 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -309,7 +309,7 @@ TEST(SMEAttributes, Transitions) { // Shared ZA -> Private ZA Interface ASSERT_FALSE(CA(ZA_Shared, Private_ZA).requiresDisablingZABeforeCall()); - ASSERT_TRUE(CA(ZA_Shared, Private_ZA).requiresEnablingZAAfterCall()); + ASSERT_FALSE(CA(ZA_Shared, Private_ZA).requiresEnablingZAAfterCall()); // Shared ZT0 -> Private ZA Interface ASSERT_TRUE(CA(ZT0_Shared, Private_ZA).requiresDisablingZABeforeCall()); @@ -328,7 +328,7 @@ TEST(SMEAttributes, Transitions) { // Shared ZA & ZT0 -> Private ZA Interface ASSERT_FALSE(CA(ZA_ZT0_Shared, Private_ZA).requiresDisablingZABeforeCall()); ASSERT_TRUE(CA(ZA_ZT0_Shared, Private_ZA).requiresPreservingZT0()); - ASSERT_TRUE(CA(ZA_ZT0_Shared, Private_ZA).requiresEnablingZAAfterCall()); + ASSERT_FALSE(CA(ZA_ZT0_Shared, Private_ZA).requiresEnablingZAAfterCall()); // Shared ZA -> Shared ZA Interface ASSERT_FALSE(CA(ZA_Shared, ZT0_Shared).requiresDisablingZABeforeCall());