[AArch64][SME] Avoid ZA save state changes in loops in MachineSMEABIPass #149065
base: users/MacDue/agnostic-ZA
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This patch uses MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not find loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag.

This makes the MachineSMEABIPass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes in the loop (which we assume will execute more than once).

This does require some extra analysis, so it is only enabled at -O1 and above.

Full diff: https://github.com/llvm/llvm-project/pull/149065.diff

5 Files Affected:
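To make the heuristic concrete outside the diff below, here is a minimal standalone sketch of the weighted vote performed in `pickBundleZAStates`. The enum values, `BlockVote`, and `pickBundleState` are simplified stand-ins invented for illustration, not the pass's actual types:

```cpp
#include <array>
#include <vector>

enum ZAState { ACTIVE, LOCAL_SAVED, OFF, NUM_ZA_STATE };

struct BlockVote {
  ZAState Desired; // Edge state the block's instructions want.
  bool InLoop;     // Set when MachineLoopInfo places the block in a loop.
};

// Each block in an edge bundle votes for its desired ZA state. Blocks
// inside loops vote with a higher weight (default 10), so the winning
// state tends to match the loop body, and any save/restore transitions
// land outside the loop rather than on every iteration.
ZAState pickBundleState(const std::vector<BlockVote> &Votes,
                        int LoopEdgeWeight = 10) {
  std::array<int, NUM_ZA_STATE> Counts{};
  for (const BlockVote &V : Votes)
    Counts[V.Desired] += V.InLoop ? LoopEdgeWeight : 1;
  int Best = 0;
  for (int S = 1; S < NUM_ZA_STATE; ++S)
    if (Counts[S] > Counts[Best])
      Best = S;
  return static_cast<ZAState>(Best);
}
```

A fixed weight stands in for a real trip count here because, as the description notes, MachineLoopInfo only reports loop membership, not how often a loop executes.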
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41fc8c08..139684172f1bb 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2c1edecd0b48d..b26a137d4e0fb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -792,7 +792,7 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
void AArch64PassConfig::addMachineSSAOptimization() {
if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
- addPass(createMachineSMEABIPass());
+ addPass(createMachineSMEABIPass(TM->getOptLevel()));
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
addPass(createSMEPeepholeOptPass());
@@ -825,7 +825,7 @@ bool AArch64PassConfig::addILPOpts() {
void AArch64PassConfig::addPreRegAlloc() {
if (EnableNewSMEABILowering && TM->getOptLevel() == CodeGenOptLevel::None)
- addPass(createMachineSMEABIPass());
+ addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 7c0cad299cc64..f63a338b4bd23 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -28,6 +29,12 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-machine-sme-abi"
+static cl::opt<int>
+ LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
+ cl::init(10),
+ cl::desc("Edge weight for basic blocks within loops (used "
+ "for placing ZA saves/restores)"));
+
namespace {
enum ZAState {
@@ -112,7 +119,8 @@ getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
struct MachineSMEABI : public MachineFunctionPass {
inline static char ID = 0;
- MachineSMEABI() : MachineFunctionPass(ID) {}
+ MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+ : MachineFunctionPass(ID), OptLevel(OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -121,6 +129,9 @@ struct MachineSMEABI : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<EdgeBundlesWrapperLegacy>();
+ // Only analyse loops at -O1 and above.
+ if (OptLevel != CodeGenOptLevel::None)
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
@@ -197,6 +208,8 @@ struct MachineSMEABI : public MachineFunctionPass {
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
+ CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
// All pass state that must be cleared between functions.
struct PassState {
SmallVector<BlockInfo> Blocks;
@@ -209,6 +222,7 @@ struct MachineSMEABI : public MachineFunctionPass {
} State;
EdgeBundles *Bundles = nullptr;
+ MachineLoopInfo *MLI = nullptr;
};
void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
@@ -302,18 +316,23 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << " (no state preference)\n");
continue;
}
+ bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID));
bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
+ int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
+ if (IsLoop)
+ LLVM_DEBUG(dbgs() << " IsLoop");
+ LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
ZAState DesiredIncomingState = Block.Insts.front().NeededState;
if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
- EdgeStateCounts[DesiredIncomingState]++;
+ EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
<< getZAStateString(DesiredIncomingState));
}
ZAState DesiredOutgoingState = Block.Insts.front().NeededState;
if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
- EdgeStateCounts[DesiredOutgoingState]++;
+ EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
<< getZAStateString(DesiredOutgoingState));
}
@@ -771,6 +790,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
// Reset pass state.
State = PassState{};
Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+ if (OptLevel != CodeGenOptLevel::None)
+ MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
@@ -799,4 +820,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+ return new MachineSMEABI(OptLevel);
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
new file mode 100644
index 0000000000000..200280f52acb0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
+
+declare void @private_za_call()
+declare void @shared_za_call() "aarch64_inout_za"
+
+; This test checks that at -O0 we don't attempt to optimize lazy save state
+; changes in loops, and that at -O1 (and above) we attempt to push state changes
+; out of loops.
+
+define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O0: // %bb.0: // %entry
+; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-O0-NEXT: mov x29, sp
+; CHECK-O0-NEXT: sub sp, sp, #32
+; CHECK-O0-NEXT: rdsvl x9, #1
+; CHECK-O0-NEXT: mov x8, sp
+; CHECK-O0-NEXT: msub x8, x9, x9, x8
+; CHECK-O0-NEXT: mov sp, x8
+; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
+; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
+; CHECK-O0-NEXT: bl shared_za_call
+; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT: mov w8, wzr
+; CHECK-O0-NEXT: subs w9, w0, #1
+; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT: b.lt .LBB0_4
+; CHECK-O0-NEXT: b .LBB0_1
+; CHECK-O0-NEXT: .LBB0_1: // %loop
+; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
+; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
+; CHECK-O0-NEXT: sub x8, x29, #16
+; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
+; CHECK-O0-NEXT: bl private_za_call
+; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
+; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT: add w9, w8, #1
+; CHECK-O0-NEXT: mov w8, w9
+; CHECK-O0-NEXT: subs w9, w9, w10
+; CHECK-O0-NEXT: mrs x9, NZCV
+; CHECK-O0-NEXT: smstart za
+; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
+; CHECK-O0-NEXT: sub x0, x29, #16
+; CHECK-O0-NEXT: cbz x10, .LBB0_2
+; CHECK-O0-NEXT: b .LBB0_3
+; CHECK-O0-NEXT: .LBB0_2: // %loop
+; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT: bl __arm_tpidr2_restore
+; CHECK-O0-NEXT: b .LBB0_3
+; CHECK-O0-NEXT: .LBB0_3: // %loop
+; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-O0-NEXT: msr NZCV, x9
+; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT: b.ne .LBB0_1
+; CHECK-O0-NEXT: b .LBB0_4
+; CHECK-O0-NEXT: .LBB0_4: // %exit
+; CHECK-O0-NEXT: mov sp, x29
+; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-O0-NEXT: b shared_za_call
+;
+; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O1: // %bb.0: // %entry
+; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-O1-NEXT: mov x29, sp
+; CHECK-O1-NEXT: sub sp, sp, #16
+; CHECK-O1-NEXT: rdsvl x8, #1
+; CHECK-O1-NEXT: mov x9, sp
+; CHECK-O1-NEXT: msub x9, x8, x8, x9
+; CHECK-O1-NEXT: mov sp, x9
+; CHECK-O1-NEXT: mov w19, w0
+; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-O1-NEXT: bl shared_za_call
+; CHECK-O1-NEXT: cmp w19, #1
+; CHECK-O1-NEXT: sub x8, x29, #16
+; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
+; CHECK-O1-NEXT: b.lt .LBB0_2
+; CHECK-O1-NEXT: .LBB0_1: // %loop
+; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-O1-NEXT: bl private_za_call
+; CHECK-O1-NEXT: subs w19, w19, #1
+; CHECK-O1-NEXT: b.ne .LBB0_1
+; CHECK-O1-NEXT: .LBB0_2: // %exit
+; CHECK-O1-NEXT: smstart za
+; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-O1-NEXT: sub x0, x29, #16
+; CHECK-O1-NEXT: cbnz x8, .LBB0_4
+; CHECK-O1-NEXT: // %bb.3: // %exit
+; CHECK-O1-NEXT: bl __arm_tpidr2_restore
+; CHECK-O1-NEXT: .LBB0_4: // %exit
+; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-O1-NEXT: mov sp, x29
+; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-O1-NEXT: b shared_za_call
+entry:
+ %cmpgt = icmp sgt i32 %n, 0
+ tail call void @shared_za_call()
+ br i1 %cmpgt, label %loop, label %exit
+
+loop:
+ %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+ tail call void @private_za_call()
+ %next_iv = add nuw nsw i32 %iv, 1
+ %cmpeq = icmp eq i32 %next_iv, %n
+ br i1 %cmpeq, label %exit, label %loop
+
+exit:
+ tail call void @shared_za_call()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index d3d7e953bedfa..e9ef9d22aaba5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -102,7 +102,7 @@ exit:
ret void
}
-; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
+; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
; CHECK-LABEL: private_za_loop_active_entry_and_exit:
; CHECK: // %bb.0: // %entry
@@ -154,7 +154,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -165,30 +165,25 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
; CHECK-NEWLOWERING-NEXT: cmp w19, #1
-; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT: b .LBB1_3
-; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
-; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
+; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_2
+; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %loop
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
+; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
+; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_1
+; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: b .LBB1_2
-; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
+; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %exit
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: b shared_za_call
entry:
Note: This patch is a minor improvement to placing saves/restores. For more complex programs, we will need to propagate required ZA states through blocks with "no preference" to make better decisions.
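As a hedged illustration of that follow-up idea (not a planned implementation), such propagation could be a simple worklist walk that pushes known desired states into "no preference" blocks so their edges can vote too. The `Block` type and CFG representation below are invented for this sketch:

```cpp
#include <queue>
#include <vector>

struct Block {
  std::vector<int> Succs; // Successor block IDs.
  int Desired = -1;       // -1 encodes "no state preference".
};

void propagateDesiredStates(std::vector<Block> &Blocks) {
  std::queue<int> Worklist;
  // Seed the worklist with blocks that already have a preference.
  for (int I = 0, E = (int)Blocks.size(); I != E; ++I)
    if (Blocks[I].Desired != -1)
      Worklist.push(I);
  while (!Worklist.empty()) {
    int B = Worklist.front();
    Worklist.pop();
    // Successors with no preference adopt the first state that reaches
    // them; blocks with their own preference are left untouched.
    for (int Succ : Blocks[B].Succs)
      if (Blocks[Succ].Desired == -1) {
        Blocks[Succ].Desired = Blocks[B].Desired;
        Worklist.push(Succ);
      }
  }
}
```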