diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 01223f12bb474..8a69d4c9b449f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8292,7 +8292,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+  if (getTM().useNewSMEABILowering()) {
     if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
       SDValue Size;
       if (Attrs.hasZAState()) {
@@ -9113,9 +9113,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
   bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
   auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
-    // TODO: Handle agnostic ZA functions.
-    if (!UseNewSMEABILowering || IsAgnosticZAFunction)
+    if (!UseNewSMEABILowering)
+      return std::nullopt;
+    if (IsAgnosticZAFunction) {
+      if (CallAttrs.requiresPreservingAllZAState())
+        return AArch64ISD::REQUIRES_ZA_SAVE;
       return std::nullopt;
+    }
     if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
       return std::nullopt;
     return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
@@ -9195,7 +9199,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   };
 
   bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
-  bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+  bool RequiresSaveAllZA =
+      !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
   if (RequiresLazySave) {
     const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     MachinePointerInfo MPI =
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index e8cbeacb98192..0e85a5279d2b0 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
 //
 //===----------------------------------------------------------------------===//
 //
@@ -200,7 +200,7 @@ struct MachineSMEABI : public MachineFunctionPass {
 
   /// Inserts code to handle changes between ZA states within the function.
   /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
-  void insertStateChanges();
+  void insertStateChanges(bool IsAgnosticZA);
 
   // Emission routines for private and shared ZA functions (using lazy saves).
   void emitNewZAPrologue(MachineBasicBlock &MBB,
@@ -215,8 +215,41 @@ struct MachineSMEABI : public MachineFunctionPass {
   void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  bool ClearTPIDR2);
 
+  // Emission routines for agnostic ZA functions.
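+  // Note: For agnostic ZA there is no TPIDR2 lazy-save block; instead, the
+  // entire ZA state is saved to (and restored from) a buffer of
+  // __arm_sme_state_size bytes via the __arm_sme_save/__arm_sme_restore
+  // SME ABI support routines (see emitFullZASaveRestore below).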
+  void emitSetupFullZASave(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           LiveRegs PhysLiveRegs);
+  void emitFullZASaveRestore(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             LiveRegs PhysLiveRegs, bool IsSave);
+  void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    LiveRegs PhysLiveRegs);
+
   void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                       ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+                       ZAState From, ZAState To, LiveRegs PhysLiveRegs,
+                       bool IsAgnosticZA);
+
+  // Helpers for switching between lazy/full ZA save/restore routines.
+  void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+    return emitSetupLazySave(MBB, MBBI);
+  }
+  void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+    return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+  }
+  void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+    return emitAllocateLazySaveBuffer(MBB, MBBI);
+  }
 
   /// Save live physical registers to virtual registers.
   PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
@@ -228,6 +261,8 @@ struct MachineSMEABI : public MachineFunctionPass {
   /// Get or create a TPIDR2 block in this function.
   TPIDR2State getTPIDR2Block();
 
+  Register getAgnosticZABufferPtr();
+
 private:
   /// Contains the needed ZA state (and live registers) at an instruction.
   struct InstInfo {
@@ -241,6 +276,7 @@ struct MachineSMEABI : public MachineFunctionPass {
   struct BlockInfo {
     ZAState FixedEntryState{ZAState::ANY};
     SmallVector<InstInfo> Insts;
+    LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
 
@@ -250,6 +286,9 @@ struct MachineSMEABI : public MachineFunctionPass {
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
     std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+    Register AgnosticZABufferPtr = AArch64::NoRegister;
+    LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
+    bool HasFullZASaveRestore = false;
   } State;
 
   MachineFunction *MF = nullptr;
@@ -261,7 +300,8 @@
 };
 
 void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
-  assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+  assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+          SMEFnAttrs.hasZAState()) &&
          "Expected function to have ZA/ZT0 state!");
 
   State.Blocks.resize(MF->getNumBlockIDs());
@@ -295,6 +335,7 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
     Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
     auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+    auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
     for (MachineInstr &MI : reverse(MBB)) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
@@ -305,7 +346,9 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       // block setup.
       if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
         State.AfterSMEProloguePt = MBBI;
+        State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
       }
+      // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
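+      // For example, a call from an agnostic-ZA function to a private-ZA
+      // callee still needs ZA saved around the call (ACTIVE -> LOCAL_SAVED),
+      // but the save/restore is a full __arm_sme_save/__arm_sme_restore
+      // rather than a TPIDR2-based lazy save.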
      auto [NeededState, InsertPt] = getZAStateBeforeInst(
          *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
      assert((InsertPt == MBBI ||
@@ -314,6 +357,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       // TODO: Do something to avoid state changes where NZCV is live.
       if (MBBI == FirstTerminatorInsertPt)
         Block.PhysLiveRegsAtExit = PhysLiveRegs;
+      if (MBBI == FirstNonPhiInsertPt)
+        Block.PhysLiveRegsAtEntry = PhysLiveRegs;
       if (NeededState != ZAState::ANY)
         Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
     }
@@ -380,7 +425,7 @@ void MachineSMEABI::assignBundleZAStates() {
   }
 }
 
-void MachineSMEABI::insertStateChanges() {
+void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
   for (MachineBasicBlock &MBB : *MF) {
     const BlockInfo &Block = State.Blocks[MBB.getNumber()];
     ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
@@ -393,7 +438,7 @@ void MachineSMEABI::insertStateChanges() {
     for (auto &Inst : Block.Insts) {
       if (CurrentState != Inst.NeededState)
         emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
-                        Inst.PhysLiveRegs);
+                        Inst.PhysLiveRegs, IsAgnosticZA);
       CurrentState = Inst.NeededState;
     }
 
@@ -404,7 +449,7 @@ void MachineSMEABI::insertStateChanges() {
         State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
     if (CurrentState != OutState)
       emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
-                      Block.PhysLiveRegsAtExit);
+                      Block.PhysLiveRegsAtExit, IsAgnosticZA);
   }
 }
 
@@ -618,10 +663,95 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
       .addImm(1);
 }
 
+Register MachineSMEABI::getAgnosticZABufferPtr() {
+  if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+    return State.AgnosticZABufferPtr;
+  if (auto BufferPtr =
+          MF->getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
+      BufferPtr != AArch64::NoRegister)
+    State.AgnosticZABufferPtr = BufferPtr;
+  else
+    State.AgnosticZABufferPtr =
+        MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          LiveRegs PhysLiveRegs, bool IsSave) {
+  auto *TLI = Subtarget->getTargetLowering();
+  State.HasFullZASaveRestore = true;
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = AArch64::X0;
+
+  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+  // Copy the buffer pointer into X0.
+  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+      .addReg(getAgnosticZABufferPtr());
+
+  // Call __arm_sme_save/__arm_sme_restore.
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+      .addExternalSymbol(TLI->getLibcallName(
+          IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
+      .addReg(BufferPtr, RegState::Implicit)
+      .addRegMask(TRI->getCallPreservedMask(
+          *MF,
+          CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+
+  restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    LiveRegs PhysLiveRegs) {
+  auto *AFI = MF->getInfo<AArch64FunctionInfo>();
+
+  // Buffer already allocated in SelectionDAG.
+  if (AFI->getEarlyAllocSMESaveBuffer())
+    return;
+
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = getAgnosticZABufferPtr();
+  Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+  // Calculate the SME state size.
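+  // (__arm_sme_state_size returns the required buffer size in X0 and, per
+  // the SME support-routine calling convention used here, preserves X1 and
+  // above.)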
+  {
+    auto *TLI = Subtarget->getTargetLowering();
+    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
+        .addReg(AArch64::X0, RegState::ImplicitDefine)
+        .addRegMask(TRI->getCallPreservedMask(
+            *MF, CallingConv::
+                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
+        .addReg(AArch64::X0);
+  }
+
+  // Allocate a buffer object of the size given by __arm_sme_state_size.
+  {
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+        .addReg(AArch64::SP)
+        .addReg(BufferSize)
+        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+        .addReg(AArch64::SP);
+
+    // We have just allocated a variable-sized object; tell PEI about it.
+    MFI.CreateVariableSizedObject(Align(16), nullptr);
+  }
+
+  restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
 void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
                                     ZAState From, ZAState To,
-                                    LiveRegs PhysLiveRegs) {
+                                    LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
 
   // ZA not used.
   if (From == ZAState::ANY || To == ZAState::ANY)
@@ -653,12 +783,13 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
   }
 
   if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitSetupLazySave(MBB, InsertPt);
+    emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
   else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+    emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
   else if (To == ZAState::OFF) {
     assert(From != ZAState::CALLER_DORMANT &&
            "CALLER_DORMANT to OFF should have already been handled");
+    assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
     emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
   } else {
     dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@@ -678,7 +809,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
-  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+      !SMEFnAttrs.hasAgnosticZAInterface())
     return false;
 
   assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -692,20 +824,27 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   TRI = Subtarget->getRegisterInfo();
   MRI = &MF.getRegInfo();
 
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+
   collectNeededZAStates(SMEFnAttrs);
   assignBundleZAStates();
-  insertStateChanges();
+  insertStateChanges(/*IsAgnosticZA=*/IsAgnosticZA);
 
   // Allocate save buffer (if needed).
-  if (State.TPIDR2Block) {
+  if (State.HasFullZASaveRestore || State.TPIDR2Block) {
    if (State.AfterSMEProloguePt) {
       // Note: With inline stack probes the AfterSMEProloguePt may not be in the
       // entry block (due to the probing loop).
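+      // For agnostic ZA the buffer allocation may itself call
+      // __arm_sme_state_size, so the registers recorded as live after the SME
+      // prologue (PhysLiveRegsAfterSMEPrologue) are preserved around it.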
- emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(), - *State.AfterSMEProloguePt); + emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(), + *State.AfterSMEProloguePt, + State.PhysLiveRegsAfterSMEPrologue, + /*IsAgnosticZA=*/IsAgnosticZA); } else { MachineBasicBlock &EntryBlock = MF.front(); - emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI()); + emitAllocateZASaveBuffer( + EntryBlock, EntryBlock.getFirstNonPHI(), + State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry, + /*IsAgnosticZA=*/IsAgnosticZA); } } diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 25a7b87d37d9e..cf294ca0ead7e 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sme2 < %s | FileCheck %s -; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s +; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING target triple = "aarch64" @@ -9,10 +9,10 @@ declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic" ; No calls. Test that no buffer is allocated. define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" { -; CHECK-LABEL: agnostic_caller_no_callees: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr x0, [x0] -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_caller_no_callees: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ldr x0, [x0] +; CHECK-COMMON-NEXT: ret %v = load i64, ptr %ptr ret i64 %v } @@ -51,6 +51,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -60,12 +83,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; ; Should not result in save/restore code. define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" { -; CHECK-LABEL: agnostic_caller_agnostic_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl agnostic_decl -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl agnostic_decl +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } @@ -74,12 +97,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a ; ; Should not result in lazy-save or save of ZT0 define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" { -; CHECK-LABEL: shared_caller_agnostic_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl agnostic_decl -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: shared_caller_agnostic_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl agnostic_decl +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } @@ -126,6 +149,41 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x0, x20 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x20 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -187,6 +245,55 @@ define i64 
@streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: mov x20, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: .LBB5_2: +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: .LBB5_4: +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-NEWLOWERING-NEXT: // %bb.5: +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: .LBB5_6: +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-NEWLOWERING-NEXT: // %bb.7: +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: .LBB5_8: +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -223,6 +330,31 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32] +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]! +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee +; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9 ) nounwind "aarch64_za_state_agnostic" { %ret = call i64 @many_args_private_za_callee(