Skip to content

Commit c78ddd3

Browse files
committed
[AArch64] Unroll some loops with early-continues on Apple Silicon. (llvm#118499)
Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops. Builds on top of llvm#118317. PR: llvm#118499. (cherry picked from commit d486b76)
1 parent 96f9ad3 commit c78ddd3

File tree

2 files changed

+305
-42
lines changed

2 files changed

+305
-42
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 72 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3650,51 +3650,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
36503650

36513651
// Try to unroll small, single block loops, if they have load/store
36523652
// dependencies, to expose more parallel memory access streams.
3653-
if (L->getHeader() != L->getLoopLatch() || Size > 8)
3654-
return;
3653+
BasicBlock *Header = L->getHeader();
3654+
if (Header == L->getLoopLatch()) {
3655+
if (Size > 8)
3656+
return;
36553657

3656-
SmallPtrSet<Value *, 8> LoadedValues;
3657-
SmallVector<StoreInst *> Stores;
3658-
for (auto *BB : L->blocks()) {
3659-
for (auto &I : *BB) {
3660-
Value *Ptr = getLoadStorePointerOperand(&I);
3661-
if (!Ptr)
3662-
continue;
3663-
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
3664-
if (SE.isLoopInvariant(PtrSCEV, L))
3665-
continue;
3666-
if (isa<LoadInst>(&I))
3667-
LoadedValues.insert(&I);
3668-
else
3669-
Stores.push_back(cast<StoreInst>(&I));
3658+
SmallPtrSet<Value *, 8> LoadedValues;
3659+
SmallVector<StoreInst *> Stores;
3660+
for (auto *BB : L->blocks()) {
3661+
for (auto &I : *BB) {
3662+
Value *Ptr = getLoadStorePointerOperand(&I);
3663+
if (!Ptr)
3664+
continue;
3665+
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
3666+
if (SE.isLoopInvariant(PtrSCEV, L))
3667+
continue;
3668+
if (isa<LoadInst>(&I))
3669+
LoadedValues.insert(&I);
3670+
else
3671+
Stores.push_back(cast<StoreInst>(&I));
3672+
}
36703673
}
3671-
}
36723674

3673-
// Try to find an unroll count that maximizes the use of the instruction
3674-
// window, i.e. trying to fetch as many instructions per cycle as possible.
3675-
unsigned MaxInstsPerLine = 16;
3676-
unsigned UC = 1;
3677-
unsigned BestUC = 1;
3678-
unsigned SizeWithBestUC = BestUC * Size;
3679-
while (UC <= 8) {
3680-
unsigned SizeWithUC = UC * Size;
3681-
if (SizeWithUC > 48)
3682-
break;
3683-
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3684-
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3685-
BestUC = UC;
3686-
SizeWithBestUC = BestUC * Size;
3675+
// Try to find an unroll count that maximizes the use of the instruction
3676+
// window, i.e. trying to fetch as many instructions per cycle as possible.
3677+
unsigned MaxInstsPerLine = 16;
3678+
unsigned UC = 1;
3679+
unsigned BestUC = 1;
3680+
unsigned SizeWithBestUC = BestUC * Size;
3681+
while (UC <= 8) {
3682+
unsigned SizeWithUC = UC * Size;
3683+
if (SizeWithUC > 48)
3684+
break;
3685+
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3686+
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3687+
BestUC = UC;
3688+
SizeWithBestUC = BestUC * Size;
3689+
}
3690+
UC++;
36873691
}
3688-
UC++;
3692+
3693+
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
3694+
return LoadedValues.contains(SI->getOperand(0));
3695+
}))
3696+
return;
3697+
3698+
UP.Runtime = true;
3699+
UP.DefaultUnrollRuntimeCount = BestUC;
3700+
return;
36893701
}
36903702

3691-
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
3692-
return LoadedValues.contains(SI->getOperand(0));
3693-
}))
3703+
// Try to runtime-unroll loops with early-continues depending on loop-varying
3704+
// loads; this helps with branch-prediction for the early-continues.
3705+
auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
3706+
auto *Latch = L->getLoopLatch();
3707+
SmallVector<BasicBlock *> Preds(predecessors(Latch));
3708+
if (!Term || !Term->isConditional() || Preds.size() == 1 ||
3709+
none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
3710+
none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
36943711
return;
36953712

3696-
UP.Runtime = true;
3697-
UP.DefaultUnrollRuntimeCount = BestUC;
3713+
std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
3714+
[&](Instruction *I, unsigned Depth) -> bool {
3715+
if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
3716+
return false;
3717+
3718+
if (isa<LoadInst>(I))
3719+
return true;
3720+
3721+
return any_of(I->operands(), [&](Value *V) {
3722+
auto *I = dyn_cast<Instruction>(V);
3723+
return I && DependsOnLoopLoad(I, Depth + 1);
3724+
});
3725+
};
3726+
CmpInst::Predicate Pred;
3727+
Instruction *I;
3728+
if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
3729+
m_Value())) &&
3730+
DependsOnLoopLoad(I, 0)) {
3731+
UP.Runtime = true;
3732+
}
36983733
}
36993734

37003735
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

0 commit comments

Comments
 (0)