Skip to content

Commit d486b76

Browse files
authored
[AArch64] Unroll some loops with early-continues on Apple Silicon. (llvm#118499)
Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops. Builds on top of llvm#118317. PR: llvm#118499.
1 parent 6261599 commit d486b76

File tree

2 files changed

+305
-42
lines changed

2 files changed

+305
-42
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 72 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4085,51 +4085,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
40854085

40864086
// Try to unroll small, single block loops, if they have load/store
40874087
// dependencies, to expose more parallel memory access streams.
4088-
if (L->getHeader() != L->getLoopLatch() || Size > 8)
4089-
return;
4088+
BasicBlock *Header = L->getHeader();
4089+
if (Header == L->getLoopLatch()) {
4090+
if (Size > 8)
4091+
return;
40904092

4091-
SmallPtrSet<Value *, 8> LoadedValues;
4092-
SmallVector<StoreInst *> Stores;
4093-
for (auto *BB : L->blocks()) {
4094-
for (auto &I : *BB) {
4095-
Value *Ptr = getLoadStorePointerOperand(&I);
4096-
if (!Ptr)
4097-
continue;
4098-
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4099-
if (SE.isLoopInvariant(PtrSCEV, L))
4100-
continue;
4101-
if (isa<LoadInst>(&I))
4102-
LoadedValues.insert(&I);
4103-
else
4104-
Stores.push_back(cast<StoreInst>(&I));
4093+
SmallPtrSet<Value *, 8> LoadedValues;
4094+
SmallVector<StoreInst *> Stores;
4095+
for (auto *BB : L->blocks()) {
4096+
for (auto &I : *BB) {
4097+
Value *Ptr = getLoadStorePointerOperand(&I);
4098+
if (!Ptr)
4099+
continue;
4100+
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4101+
if (SE.isLoopInvariant(PtrSCEV, L))
4102+
continue;
4103+
if (isa<LoadInst>(&I))
4104+
LoadedValues.insert(&I);
4105+
else
4106+
Stores.push_back(cast<StoreInst>(&I));
4107+
}
41054108
}
4106-
}
41074109

4108-
// Try to find an unroll count that maximizes the use of the instruction
4109-
// window, i.e. trying to fetch as many instructions per cycle as possible.
4110-
unsigned MaxInstsPerLine = 16;
4111-
unsigned UC = 1;
4112-
unsigned BestUC = 1;
4113-
unsigned SizeWithBestUC = BestUC * Size;
4114-
while (UC <= 8) {
4115-
unsigned SizeWithUC = UC * Size;
4116-
if (SizeWithUC > 48)
4117-
break;
4118-
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4119-
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4120-
BestUC = UC;
4121-
SizeWithBestUC = BestUC * Size;
4110+
// Try to find an unroll count that maximizes the use of the instruction
4111+
// window, i.e. trying to fetch as many instructions per cycle as possible.
4112+
unsigned MaxInstsPerLine = 16;
4113+
unsigned UC = 1;
4114+
unsigned BestUC = 1;
4115+
unsigned SizeWithBestUC = BestUC * Size;
4116+
while (UC <= 8) {
4117+
unsigned SizeWithUC = UC * Size;
4118+
if (SizeWithUC > 48)
4119+
break;
4120+
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4121+
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4122+
BestUC = UC;
4123+
SizeWithBestUC = BestUC * Size;
4124+
}
4125+
UC++;
41224126
}
4123-
UC++;
4127+
4128+
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4129+
return LoadedValues.contains(SI->getOperand(0));
4130+
}))
4131+
return;
4132+
4133+
UP.Runtime = true;
4134+
UP.DefaultUnrollRuntimeCount = BestUC;
4135+
return;
41244136
}
41254137

4126-
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4127-
return LoadedValues.contains(SI->getOperand(0));
4128-
}))
4138+
// Try to runtime-unroll loops with early-continues depending on loop-varying
4139+
// loads; this helps with branch-prediction for the early-continues.
4140+
auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4141+
auto *Latch = L->getLoopLatch();
4142+
SmallVector<BasicBlock *> Preds(predecessors(Latch));
4143+
if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4144+
none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4145+
none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
41294146
return;
41304147

4131-
UP.Runtime = true;
4132-
UP.DefaultUnrollRuntimeCount = BestUC;
4148+
std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4149+
[&](Instruction *I, unsigned Depth) -> bool {
4150+
if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4151+
return false;
4152+
4153+
if (isa<LoadInst>(I))
4154+
return true;
4155+
4156+
return any_of(I->operands(), [&](Value *V) {
4157+
auto *I = dyn_cast<Instruction>(V);
4158+
return I && DependsOnLoopLoad(I, Depth + 1);
4159+
});
4160+
};
4161+
CmpPredicate Pred;
4162+
Instruction *I;
4163+
if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4164+
m_Value())) &&
4165+
DependsOnLoopLoad(I, 0)) {
4166+
UP.Runtime = true;
4167+
}
41334168
}
41344169

41354170
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

0 commit comments

Comments
 (0)