Skip to content

Commit a0dfafc

Browse files
committed
[AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs.
Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime-unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count is expensive to expand, and loops that are expected to execute only a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide (Depends on #118316 for TTI changes, which are included in this PR for now)
1 parent a5eb32b commit a0dfafc

File tree

2 files changed

+168
-4
lines changed

2 files changed

+168
-4
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3989,6 +3989,90 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
39893989
}
39903990
}
39913991

3992+
/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
3993+
/// OOO engine's wide instruction window and various predictors.
3994+
static void
3995+
getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
3996+
TargetTransformInfo::UnrollingPreferences &UP,
3997+
AArch64TTIImpl &TTI) {
3998+
// Limit loops with structure that is highly likely to benefit from runtime
3999+
// unrolling; that is we exclude outer loops, loops with multiple exits and
4000+
// many blocks (i.e. likely with complex control flow). Note that the
4001+
// heuristics here may be overly conservative and we err on the side of
4002+
// avoiding runtime unrolling rather than unroll excessively. They are all
4003+
// subject to further refinement.
4004+
if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4005+
return;
4006+
4007+
const SCEV *BTC = SE.getBackedgeTakenCount(L);
4008+
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4009+
(SE.getSmallConstantMaxTripCount(L) > 0 &&
4010+
SE.getSmallConstantMaxTripCount(L) <= 32))
4011+
return;
4012+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4013+
return;
4014+
4015+
int64_t Size = 0;
4016+
for (auto *BB : L->getBlocks()) {
4017+
for (auto &I : *BB) {
4018+
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4019+
return;
4020+
SmallVector<const Value *, 4> Operands(I.operand_values());
4021+
Size +=
4022+
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4023+
}
4024+
}
4025+
4026+
// Limit to loops with trip counts that are cheap to expand.
4027+
UP.SCEVExpansionBudget = 1;
4028+
4029+
// Try to unroll small, single block loops, if they have load/store
4030+
// dependencies, to expose more parallel memory access streams.
4031+
if (L->getHeader() != L->getLoopLatch() || Size > 8)
4032+
return;
4033+
4034+
SmallPtrSet<const SCEV *, 8> LoadPtrs;
4035+
SmallPtrSet<const SCEV *, 8> StorePtrs;
4036+
SmallPtrSet<Value *, 8> LoadedValues;
4037+
SmallVector<StoreInst *> Stores;
4038+
for (auto *BB : L->blocks()) {
4039+
for (auto &I : *BB) {
4040+
Value *Ptr = getLoadStorePointerOperand(&I);
4041+
if (!Ptr)
4042+
continue;
4043+
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4044+
if (SE.isLoopInvariant(PtrSCEV, L))
4045+
continue;
4046+
if (isa<LoadInst>(&I)) {
4047+
LoadPtrs.insert(PtrSCEV);
4048+
LoadedValues.insert(&I);
4049+
} else {
4050+
Stores.push_back(cast<StoreInst>(&I));
4051+
StorePtrs.insert(PtrSCEV);
4052+
}
4053+
}
4054+
}
4055+
4056+
// Try to find an unroll count that maximizes the use of the instruction
4057+
// window.
4058+
unsigned UC = std::max(16ll / Size, 2ll);
4059+
unsigned BestUC = 0;
4060+
while (UC <= 8 && UC * Size <= 48) {
4061+
if ((UC * Size % 16) == 0 || (BestUC * Size % 16) < (UC * Size % 16) % 16) {
4062+
BestUC = UC;
4063+
}
4064+
UC++;
4065+
}
4066+
4067+
if (BestUC == 0 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4068+
return LoadedValues.contains(SI->getOperand(0));
4069+
}))
4070+
return;
4071+
4072+
UP.Runtime = true;
4073+
UP.DefaultUnrollRuntimeCount = BestUC;
4074+
}
4075+
39924076
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
39934077
TTI::UnrollingPreferences &UP,
39944078
OptimizationRemarkEmitter *ORE) {
@@ -4010,6 +4094,12 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
40104094
EnableFalkorHWPFUnrollFix)
40114095
getFalkorUnrollingPreferences(L, SE, UP);
40124096

4097+
if (ST->getProcFamily() == AArch64Subtarget::AppleA14 ||
4098+
ST->getProcFamily() == AArch64Subtarget::AppleA15 ||
4099+
ST->getProcFamily() == AArch64Subtarget::AppleA16 ||
4100+
ST->getProcFamily() == AArch64Subtarget::AppleM4)
4101+
getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4102+
40134103
// Scan the loop: don't unroll loops with calls as this could prevent
40144104
// inlining. Don't unroll vector loops either, as they don't benefit much from
40154105
// unrolling.

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
1212
; APPLE-LABEL: define void @small_load_store_loop(
1313
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
1414
; APPLE-NEXT: [[ENTRY:.*]]:
15+
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
16+
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
17+
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
18+
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
19+
; APPLE: [[ENTRY_NEW]]:
20+
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
1521
; APPLE-NEXT: br label %[[LOOP:.*]]
1622
; APPLE: [[LOOP]]:
17-
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
23+
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
24+
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
1825
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
1926
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
2027
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
2128
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
2229
; APPLE-NEXT: store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
23-
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
24-
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
25-
; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
30+
; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
31+
; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
32+
; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
33+
; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
34+
; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
35+
; APPLE-NEXT: store float [[L_1]], ptr [[GEP_DST_1]], align 4
36+
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
37+
; APPLE-NEXT: [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
38+
; APPLE-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
39+
; APPLE-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
40+
; APPLE-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
41+
; APPLE-NEXT: store float [[L_2]], ptr [[GEP_DST_2]], align 4
42+
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
43+
; APPLE-NEXT: [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
44+
; APPLE-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
45+
; APPLE-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
46+
; APPLE-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
47+
; APPLE-NEXT: store float [[L_3]], ptr [[GEP_DST_3]], align 4
48+
; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
49+
; APPLE-NEXT: [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
50+
; APPLE-NEXT: [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
51+
; APPLE-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
52+
; APPLE-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
53+
; APPLE-NEXT: store float [[L_4]], ptr [[GEP_DST_4]], align 4
54+
; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
55+
; APPLE-NEXT: [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
56+
; APPLE-NEXT: [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
57+
; APPLE-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
58+
; APPLE-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
59+
; APPLE-NEXT: store float [[L_5]], ptr [[GEP_DST_5]], align 4
60+
; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
61+
; APPLE-NEXT: [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
62+
; APPLE-NEXT: [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
63+
; APPLE-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
64+
; APPLE-NEXT: [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
65+
; APPLE-NEXT: store float [[L_6]], ptr [[GEP_DST_6]], align 4
66+
; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
67+
; APPLE-NEXT: [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
68+
; APPLE-NEXT: [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
69+
; APPLE-NEXT: [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
70+
; APPLE-NEXT: [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
71+
; APPLE-NEXT: store float [[L_7]], ptr [[GEP_DST_7]], align 4
72+
; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
73+
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
74+
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
75+
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
76+
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
77+
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
78+
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
79+
; APPLE: [[EXIT_UNR_LCSSA]]:
80+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
81+
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
82+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
83+
; APPLE: [[LOOP_EPIL_PREHEADER]]:
84+
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
85+
; APPLE: [[LOOP_EPIL]]:
86+
; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
87+
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
88+
; APPLE-NEXT: [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
89+
; APPLE-NEXT: [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
90+
; APPLE-NEXT: [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
91+
; APPLE-NEXT: [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
92+
; APPLE-NEXT: store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
93+
; APPLE-NEXT: [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
94+
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
95+
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
96+
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
97+
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
98+
; APPLE: [[EXIT_EPILOG_LCSSA]]:
99+
; APPLE-NEXT: br label %[[EXIT]]
26100
; APPLE: [[EXIT]]:
27101
; APPLE-NEXT: ret void
28102
;

0 commit comments

Comments
 (0)