Skip to content

Commit 2c9e14c

Browse files
committed
[AArch64][SME] Avoid ZA save state changes in loops in MachineSMEABIPass
This patch uses the MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not find loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag. This makes the MachineSMEABIPass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes in the loop (which we assume will execute more than once). This does require some extra analysis, so it is only enabled at -O1 and above. Change-Id: If318c809d2f7cc1fca144fbe424ba2a2ca7fb19f
1 parent af4a764 commit 2c9e14c

File tree

5 files changed

+159
-26
lines changed

5 files changed

+159
-26
lines changed

llvm/lib/Target/AArch64/AArch64.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
6060
FunctionPass *createAArch64CollectLOHPass();
6161
FunctionPass *createSMEABIPass();
6262
FunctionPass *createSMEPeepholeOptPass();
63-
FunctionPass *createMachineSMEABIPass();
63+
FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
6464
ModulePass *createSVEIntrinsicOptsPass();
6565
InstructionSelector *
6666
createAArch64InstructionSelector(const AArch64TargetMachine &,

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
792792

793793
void AArch64PassConfig::addMachineSSAOptimization() {
794794
if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
795-
addPass(createMachineSMEABIPass());
795+
addPass(createMachineSMEABIPass(TM->getOptLevel()));
796796

797797
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
798798
addPass(createSMEPeepholeOptPass());
@@ -825,7 +825,7 @@ bool AArch64PassConfig::addILPOpts() {
825825

826826
void AArch64PassConfig::addPreRegAlloc() {
827827
if (EnableNewSMEABILowering && TM->getOptLevel() == CodeGenOptLevel::None)
828-
addPass(createMachineSMEABIPass());
828+
addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
829829

830830
// Change dead register definitions to refer to the zero register.
831831
if (TM->getOptLevel() != CodeGenOptLevel::None &&

llvm/lib/Target/AArch64/MachineSMEABIPass.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,20 @@
2121
#include "llvm/CodeGen/LivePhysRegs.h"
2222
#include "llvm/CodeGen/MachineBasicBlock.h"
2323
#include "llvm/CodeGen/MachineFunctionPass.h"
24+
#include "llvm/CodeGen/MachineLoopInfo.h"
2425
#include "llvm/CodeGen/MachineRegisterInfo.h"
2526
#include "llvm/CodeGen/TargetRegisterInfo.h"
2627

2728
using namespace llvm;
2829

2930
#define DEBUG_TYPE "aarch64-machine-sme-abi"
3031

32+
static cl::opt<int>
33+
LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
34+
cl::init(10),
35+
cl::desc("Edge weight for basic blocks within loops (used "
36+
"for placing ZA saves/restores)"));
37+
3138
namespace {
3239

3340
enum ZAState {
@@ -112,7 +119,8 @@ getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
112119
struct MachineSMEABI : public MachineFunctionPass {
113120
inline static char ID = 0;
114121

115-
MachineSMEABI() : MachineFunctionPass(ID) {}
122+
MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
123+
: MachineFunctionPass(ID), OptLevel(OptLevel) {}
116124

117125
bool runOnMachineFunction(MachineFunction &MF) override;
118126

@@ -121,6 +129,9 @@ struct MachineSMEABI : public MachineFunctionPass {
121129
void getAnalysisUsage(AnalysisUsage &AU) const override {
122130
AU.setPreservesCFG();
123131
AU.addRequired<EdgeBundlesWrapperLegacy>();
132+
// Only analyse loops at -O1 and above.
133+
if (OptLevel != CodeGenOptLevel::None)
134+
AU.addRequired<MachineLoopInfoWrapperPass>();
124135
AU.addPreservedID(MachineLoopInfoID);
125136
AU.addPreservedID(MachineDominatorsID);
126137
MachineFunctionPass::getAnalysisUsage(AU);
@@ -197,6 +208,8 @@ struct MachineSMEABI : public MachineFunctionPass {
197208
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
198209
};
199210

211+
CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
212+
200213
// All pass state that must be cleared between functions.
201214
struct PassState {
202215
SmallVector<BlockInfo> Blocks;
@@ -209,6 +222,7 @@ struct MachineSMEABI : public MachineFunctionPass {
209222
} State;
210223

211224
EdgeBundles *Bundles = nullptr;
225+
MachineLoopInfo *MLI = nullptr;
212226
};
213227

214228
void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
@@ -302,18 +316,23 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
302316
LLVM_DEBUG(dbgs() << " (no state preference)\n");
303317
continue;
304318
}
319+
bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID));
305320
bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
306321
bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
322+
int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
323+
if (IsLoop)
324+
LLVM_DEBUG(dbgs() << " IsLoop");
307325

326+
LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
308327
ZAState DesiredIncomingState = Block.Insts.front().NeededState;
309328
if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
310-
EdgeStateCounts[DesiredIncomingState]++;
329+
EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
311330
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
312331
<< getZAStateString(DesiredIncomingState));
313332
}
314333
ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
315334
if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
316-
EdgeStateCounts[DesiredOutgoingState]++;
335+
EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
317336
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
318337
<< getZAStateString(DesiredOutgoingState));
319338
}
@@ -771,6 +790,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
771790
// Reset pass state.
772791
State = PassState{};
773792
Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
793+
if (OptLevel != CodeGenOptLevel::None)
794+
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
774795

775796
bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
776797

@@ -799,4 +820,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
799820
return true;
800821
}
801822

802-
FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
823+
FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
824+
return new MachineSMEABI(OptLevel);
825+
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
3+
; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
4+
5+
declare void @private_za_call()
6+
declare void @shared_za_call() "aarch64_inout_za"
7+
8+
; This test checks that at -O0 we don't attempt to optimize lazy save state
9+
; changes in loops, and that -O1 (and above) we attempt to push state changes
10+
; out of loops.
11+
12+
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
13+
; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
14+
; CHECK-O0: // %bb.0: // %entry
15+
; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
16+
; CHECK-O0-NEXT: mov x29, sp
17+
; CHECK-O0-NEXT: sub sp, sp, #32
18+
; CHECK-O0-NEXT: rdsvl x9, #1
19+
; CHECK-O0-NEXT: mov x8, sp
20+
; CHECK-O0-NEXT: msub x8, x9, x9, x8
21+
; CHECK-O0-NEXT: mov sp, x8
22+
; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
23+
; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
24+
; CHECK-O0-NEXT: bl shared_za_call
25+
; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
26+
; CHECK-O0-NEXT: mov w8, wzr
27+
; CHECK-O0-NEXT: subs w9, w0, #1
28+
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
29+
; CHECK-O0-NEXT: b.lt .LBB0_4
30+
; CHECK-O0-NEXT: b .LBB0_1
31+
; CHECK-O0-NEXT: .LBB0_1: // %loop
32+
; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
33+
; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
34+
; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
35+
; CHECK-O0-NEXT: sub x8, x29, #16
36+
; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
37+
; CHECK-O0-NEXT: bl private_za_call
38+
; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
39+
; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
40+
; CHECK-O0-NEXT: add w9, w8, #1
41+
; CHECK-O0-NEXT: mov w8, w9
42+
; CHECK-O0-NEXT: subs w9, w9, w10
43+
; CHECK-O0-NEXT: mrs x9, NZCV
44+
; CHECK-O0-NEXT: smstart za
45+
; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
46+
; CHECK-O0-NEXT: sub x0, x29, #16
47+
; CHECK-O0-NEXT: cbz x10, .LBB0_2
48+
; CHECK-O0-NEXT: b .LBB0_3
49+
; CHECK-O0-NEXT: .LBB0_2: // %loop
50+
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
51+
; CHECK-O0-NEXT: bl __arm_tpidr2_restore
52+
; CHECK-O0-NEXT: b .LBB0_3
53+
; CHECK-O0-NEXT: .LBB0_3: // %loop
54+
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
55+
; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
56+
; CHECK-O0-NEXT: msr NZCV, x9
57+
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
58+
; CHECK-O0-NEXT: b.ne .LBB0_1
59+
; CHECK-O0-NEXT: b .LBB0_4
60+
; CHECK-O0-NEXT: .LBB0_4: // %exit
61+
; CHECK-O0-NEXT: mov sp, x29
62+
; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
63+
; CHECK-O0-NEXT: b shared_za_call
64+
;
65+
; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
66+
; CHECK-O1: // %bb.0: // %entry
67+
; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
68+
; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
69+
; CHECK-O1-NEXT: mov x29, sp
70+
; CHECK-O1-NEXT: sub sp, sp, #16
71+
; CHECK-O1-NEXT: rdsvl x8, #1
72+
; CHECK-O1-NEXT: mov x9, sp
73+
; CHECK-O1-NEXT: msub x9, x8, x8, x9
74+
; CHECK-O1-NEXT: mov sp, x9
75+
; CHECK-O1-NEXT: mov w19, w0
76+
; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
77+
; CHECK-O1-NEXT: bl shared_za_call
78+
; CHECK-O1-NEXT: cmp w19, #1
79+
; CHECK-O1-NEXT: sub x8, x29, #16
80+
; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
81+
; CHECK-O1-NEXT: b.lt .LBB0_2
82+
; CHECK-O1-NEXT: .LBB0_1: // %loop
83+
; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
84+
; CHECK-O1-NEXT: bl private_za_call
85+
; CHECK-O1-NEXT: subs w19, w19, #1
86+
; CHECK-O1-NEXT: b.ne .LBB0_1
87+
; CHECK-O1-NEXT: .LBB0_2: // %exit
88+
; CHECK-O1-NEXT: smstart za
89+
; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
90+
; CHECK-O1-NEXT: sub x0, x29, #16
91+
; CHECK-O1-NEXT: cbnz x8, .LBB0_4
92+
; CHECK-O1-NEXT: // %bb.3: // %exit
93+
; CHECK-O1-NEXT: bl __arm_tpidr2_restore
94+
; CHECK-O1-NEXT: .LBB0_4: // %exit
95+
; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
96+
; CHECK-O1-NEXT: mov sp, x29
97+
; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
98+
; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
99+
; CHECK-O1-NEXT: b shared_za_call
100+
entry:
101+
%cmpgt = icmp sgt i32 %n, 0
102+
tail call void @shared_za_call()
103+
br i1 %cmpgt, label %loop, label %exit
104+
105+
loop:
106+
%iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
107+
tail call void @private_za_call()
108+
%next_iv = add nuw nsw i32 %iv, 1
109+
%cmpeq = icmp eq i32 %next_iv, %n
110+
br i1 %cmpeq, label %exit, label %loop
111+
112+
exit:
113+
tail call void @shared_za_call()
114+
ret void
115+
}

llvm/test/CodeGen/AArch64/sme-za-control-flow.ll

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ exit:
102102
ret void
103103
}
104104

105-
; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
105+
; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
106106
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
107107
; CHECK-LABEL: private_za_loop_active_entry_and_exit:
108108
; CHECK: // %bb.0: // %entry
@@ -154,7 +154,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
154154
; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
155155
; CHECK-NEWLOWERING: // %bb.0: // %entry
156156
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
157-
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
157+
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
158158
; CHECK-NEWLOWERING-NEXT: mov x29, sp
159159
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
160160
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -165,30 +165,25 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
165165
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
166166
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
167167
; CHECK-NEWLOWERING-NEXT: cmp w19, #1
168-
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
169-
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
170-
; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
171-
; CHECK-NEWLOWERING-NEXT: b .LBB1_3
172-
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
173-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
174-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
175-
; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
176-
; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
168+
; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
169+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
170+
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_2
171+
; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %loop
177172
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
178-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
179173
; CHECK-NEWLOWERING-NEXT: bl private_za_call
180-
; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
174+
; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
175+
; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_1
176+
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %exit
181177
; CHECK-NEWLOWERING-NEXT: smstart za
182178
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
183179
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
184-
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
185-
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
186-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
180+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_4
181+
; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
187182
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
188-
; CHECK-NEWLOWERING-NEXT: b .LBB1_2
189-
; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
183+
; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %exit
184+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
190185
; CHECK-NEWLOWERING-NEXT: mov sp, x29
191-
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
186+
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
192187
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
193188
; CHECK-NEWLOWERING-NEXT: b shared_za_call
194189
entry:

0 commit comments

Comments
 (0)