@@ -4085,51 +4085,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4085
4085
4086
4086
// Try to unroll small, single-block loops, if they have load/store
4087
4087
// dependencies, to expose more parallel memory access streams.
4088
- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
4089
- return ;
4088
+ BasicBlock *Header = L->getHeader ();
4089
+ if (Header == L->getLoopLatch ()) {
4090
+ if (Size > 8 )
4091
+ return ;
4090
4092
4091
- SmallPtrSet<Value *, 8 > LoadedValues;
4092
- SmallVector<StoreInst *> Stores;
4093
- for (auto *BB : L->blocks ()) {
4094
- for (auto &I : *BB) {
4095
- Value *Ptr = getLoadStorePointerOperand (&I);
4096
- if (!Ptr )
4097
- continue ;
4098
- const SCEV *PtrSCEV = SE.getSCEV (Ptr );
4099
- if (SE.isLoopInvariant (PtrSCEV, L))
4100
- continue ;
4101
- if (isa<LoadInst>(&I))
4102
- LoadedValues.insert (&I);
4103
- else
4104
- Stores.push_back (cast<StoreInst>(&I));
4093
+ SmallPtrSet<Value *, 8 > LoadedValues;
4094
+ SmallVector<StoreInst *> Stores;
4095
+ for (auto *BB : L->blocks ()) {
4096
+ for (auto &I : *BB) {
4097
+ Value *Ptr = getLoadStorePointerOperand (&I);
4098
+ if (!Ptr )
4099
+ continue ;
4100
+ const SCEV *PtrSCEV = SE.getSCEV (Ptr );
4101
+ if (SE.isLoopInvariant (PtrSCEV, L))
4102
+ continue ;
4103
+ if (isa<LoadInst>(&I))
4104
+ LoadedValues.insert (&I);
4105
+ else
4106
+ Stores.push_back (cast<StoreInst>(&I));
4107
+ }
4105
4108
}
4106
- }
4107
4109
4108
- // Try to find an unroll count that maximizes the use of the instruction
4109
- // window, i.e. trying to fetch as many instructions per cycle as possible.
4110
- unsigned MaxInstsPerLine = 16 ;
4111
- unsigned UC = 1 ;
4112
- unsigned BestUC = 1 ;
4113
- unsigned SizeWithBestUC = BestUC * Size ;
4114
- while (UC <= 8 ) {
4115
- unsigned SizeWithUC = UC * Size ;
4116
- if (SizeWithUC > 48 )
4117
- break ;
4118
- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4119
- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4120
- BestUC = UC;
4121
- SizeWithBestUC = BestUC * Size ;
4110
+ // Try to find an unroll count that maximizes the use of the instruction
4111
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4112
+ unsigned MaxInstsPerLine = 16 ;
4113
+ unsigned UC = 1 ;
4114
+ unsigned BestUC = 1 ;
4115
+ unsigned SizeWithBestUC = BestUC * Size ;
4116
+ while (UC <= 8 ) {
4117
+ unsigned SizeWithUC = UC * Size ;
4118
+ if (SizeWithUC > 48 )
4119
+ break ;
4120
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4121
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4122
+ BestUC = UC;
4123
+ SizeWithBestUC = BestUC * Size ;
4124
+ }
4125
+ UC++;
4122
4126
}
4123
- UC++;
4127
+
4128
+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4129
+ return LoadedValues.contains (SI->getOperand (0 ));
4130
+ }))
4131
+ return ;
4132
+
4133
+ UP.Runtime = true ;
4134
+ UP.DefaultUnrollRuntimeCount = BestUC;
4135
+ return ;
4124
4136
}
4125
4137
4126
- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4127
- return LoadedValues.contains (SI->getOperand (0 ));
4128
- }))
4138
+ // Try to runtime-unroll loops with early-continues depending on loop-varying
4139
+ // loads; this helps with branch prediction for the early-continues.
4140
+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4141
+ auto *Latch = L->getLoopLatch ();
4142
+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
4143
+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
4144
+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4145
+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
4129
4146
return ;
4130
4147
4131
- UP.Runtime = true ;
4132
- UP.DefaultUnrollRuntimeCount = BestUC;
4148
+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad =
4149
+ [&](Instruction *I, unsigned Depth) -> bool {
4150
+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 )
4151
+ return false ;
4152
+
4153
+ if (isa<LoadInst>(I))
4154
+ return true ;
4155
+
4156
+ return any_of (I->operands (), [&](Value *V) {
4157
+ auto *I = dyn_cast<Instruction>(V);
4158
+ return I && DependsOnLoopLoad (I, Depth + 1 );
4159
+ });
4160
+ };
4161
+ CmpPredicate Pred;
4162
+ Instruction *I;
4163
+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
4164
+ m_Value ())) &&
4165
+ DependsOnLoopLoad (I, 0 )) {
4166
+ UP.Runtime = true ;
4167
+ }
4133
4168
}
4134
4169
4135
4170
void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments