@@ -3650,51 +3650,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
3650
3650
3651
3651
// Try to unroll small, single block loops, if they have load/store
3652
3652
// dependencies, to expose more parallel memory access streams.
3653
- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
3654
- return ;
3653
+ BasicBlock *Header = L->getHeader ();
3654
+ if (Header == L->getLoopLatch ()) {
3655
+ if (Size > 8 )
3656
+ return ;
3655
3657
3656
- SmallPtrSet<Value *, 8 > LoadedValues;
3657
- SmallVector<StoreInst *> Stores;
3658
- for (auto *BB : L->blocks ()) {
3659
- for (auto &I : *BB) {
3660
- Value *Ptr = getLoadStorePointerOperand (&I);
3661
- if (!Ptr )
3662
- continue ;
3663
- const SCEV *PtrSCEV = SE.getSCEV (Ptr );
3664
- if (SE.isLoopInvariant (PtrSCEV, L))
3665
- continue ;
3666
- if (isa<LoadInst>(&I))
3667
- LoadedValues.insert (&I);
3668
- else
3669
- Stores.push_back (cast<StoreInst>(&I));
3658
+ SmallPtrSet<Value *, 8 > LoadedValues;
3659
+ SmallVector<StoreInst *> Stores;
3660
+ for (auto *BB : L->blocks ()) {
3661
+ for (auto &I : *BB) {
3662
+ Value *Ptr = getLoadStorePointerOperand (&I);
3663
+ if (!Ptr )
3664
+ continue ;
3665
+ const SCEV *PtrSCEV = SE.getSCEV (Ptr );
3666
+ if (SE.isLoopInvariant (PtrSCEV, L))
3667
+ continue ;
3668
+ if (isa<LoadInst>(&I))
3669
+ LoadedValues.insert (&I);
3670
+ else
3671
+ Stores.push_back (cast<StoreInst>(&I));
3672
+ }
3670
3673
}
3671
- }
3672
3674
3673
- // Try to find an unroll count that maximizes the use of the instruction
3674
- // window, i.e. trying to fetch as many instructions per cycle as possible.
3675
- unsigned MaxInstsPerLine = 16 ;
3676
- unsigned UC = 1 ;
3677
- unsigned BestUC = 1 ;
3678
- unsigned SizeWithBestUC = BestUC * Size ;
3679
- while (UC <= 8 ) {
3680
- unsigned SizeWithUC = UC * Size ;
3681
- if (SizeWithUC > 48 )
3682
- break ;
3683
- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3684
- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3685
- BestUC = UC;
3686
- SizeWithBestUC = BestUC * Size ;
3675
+ // Try to find an unroll count that maximizes the use of the instruction
3676
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
3677
+ unsigned MaxInstsPerLine = 16 ;
3678
+ unsigned UC = 1 ;
3679
+ unsigned BestUC = 1 ;
3680
+ unsigned SizeWithBestUC = BestUC * Size ;
3681
+ while (UC <= 8 ) {
3682
+ unsigned SizeWithUC = UC * Size ;
3683
+ if (SizeWithUC > 48 )
3684
+ break ;
3685
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3686
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3687
+ BestUC = UC;
3688
+ SizeWithBestUC = BestUC * Size ;
3689
+ }
3690
+ UC++;
3687
3691
}
3688
- UC++;
3692
+
3693
+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
3694
+ return LoadedValues.contains (SI->getOperand (0 ));
3695
+ }))
3696
+ return ;
3697
+
3698
+ UP.Runtime = true ;
3699
+ UP.DefaultUnrollRuntimeCount = BestUC;
3700
+ return ;
3689
3701
}
3690
3702
3691
- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
3692
- return LoadedValues.contains (SI->getOperand (0 ));
3693
- }))
3703
+ // Try to runtime-unroll loops with early-continues depending on loop-varying
3704
+ // loads; this helps with branch-prediction for the early-continues.
3705
+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
3706
+ auto *Latch = L->getLoopLatch ();
3707
+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
3708
+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
3709
+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
3710
+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
3694
3711
return ;
3695
3712
3696
- UP.Runtime = true ;
3697
- UP.DefaultUnrollRuntimeCount = BestUC;
3713
+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad =
3714
+ [&](Instruction *I, unsigned Depth) -> bool {
3715
+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 )
3716
+ return false ;
3717
+
3718
+ if (isa<LoadInst>(I))
3719
+ return true ;
3720
+
3721
+ return any_of (I->operands (), [&](Value *V) {
3722
+ auto *I = dyn_cast<Instruction>(V);
3723
+ return I && DependsOnLoopLoad (I, Depth + 1 );
3724
+ });
3725
+ };
3726
+ CmpInst::Predicate Pred;
3727
+ Instruction *I;
3728
+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
3729
+ m_Value ())) &&
3730
+ DependsOnLoopLoad (I, 0 )) {
3731
+ UP.Runtime = true ;
3732
+ }
3698
3733
}
3699
3734
3700
3735
void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments