[AArch64] Unroll some loops with early-continues on Apple Silicon. #118499

Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops.

Builds on top of #118317.

Patch is 31.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118499.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd52..7e7b18ffcc15b7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3989,6 +3989,133 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
+/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ AArch64TTIImpl &TTI) {
+ // Limit loops with structure that is highly likely to benefit from runtime
+ // unrolling; that is we exclude outer loops, loops with multiple exits and
+ // many blocks (i.e. likely with complex control flow). Note that the
+ // heuristics here may be overly conservative and we err on the side of
+ // avoiding runtime unrolling rather than unroll excessively. They are all
+ // subject to further refinement.
+ if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+ return;
+
+ const SCEV *BTC = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+ (SE.getSmallConstantMaxTripCount(L) > 0 &&
+ SE.getSmallConstantMaxTripCount(L) <= 32))
+ return;
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return;
+
+ int64_t Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+ return;
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ Size +=
+ *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+ }
+ }
+
+ // Limit to loops with trip counts that are cheap to expand.
+ UP.SCEVExpansionBudget = 1;
+
+ // Try to unroll small, single block loops, if they have load/store
+ // dependencies, to expose more parallel memory access streams.
+ BasicBlock *Header = L->getHeader();
+ if (Header == L->getLoopLatch()) {
+ if (Size > 8)
+ return;
+
+ SmallPtrSet<Value *, 8> LoadedValues;
+ SmallVector<StoreInst *> Stores;
+ for (auto *BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (SE.isLoopInvariant(PtrSCEV, L))
+ continue;
+ if (isa<LoadInst>(&I))
+ LoadedValues.insert(&I);
+ else
+ Stores.push_back(cast<StoreInst>(&I));
+ }
+ }
+
+ // Try to find an unroll count that maximizes the use of the instruction
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
+ unsigned MaxInstsPerLine = 16;
+ unsigned UC = 1;
+ unsigned BestUC = 1;
+ unsigned SizeWithBestUC = BestUC * Size;
+ while (UC <= 8) {
+ unsigned SizeWithUC = UC * Size;
+ if (SizeWithUC > 48)
+ break;
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+ BestUC = UC;
+ SizeWithBestUC = BestUC * Size;
+ }
+ UC++;
+ }
+
+ if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+ return LoadedValues.contains(SI->getOperand(0));
+ }))
+ return;
+
+ UP.Runtime = true;
+ UP.DefaultUnrollRuntimeCount = BestUC;
+ return;
+ }
+
+ // Try to runtime-unroll loops with early-continues depending on loop-varying
+ // loads; this helps with branch-prediction for the early-continues.
+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
+ auto *Latch = L->getLoopLatch();
+ SmallVector<BasicBlock *> Preds(predecessors(Latch));
+ if (!Term || !Term->isConditional() || Preds.size() == 1 ||
+ none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
+ none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
+ return;
+
+ std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
+ [&](Instruction *I, unsigned Depth) -> bool {
+ if (isa<PHINode>(I))
+ return false;
+
+ if (L->isLoopInvariant(I))
+ return false;
+
+ if (Depth > 8)
+ return false;
+
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return true;
+
+ return any_of(I->operands(), [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && DependsOnLoopLoad(I, Depth + 1);
+ });
+ };
+ CmpInst::Predicate Pred;
+ Instruction *I;
+ if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
+ m_Value())) &&
+ DependsOnLoopLoad(I, 0)) {
+ UP.Runtime = true;
+ }
+}
+
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
@@ -4006,9 +4133,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
- EnableFalkorHWPFUnrollFix)
- getFalkorUnrollingPreferences(L, SE, UP);
+ // Apply subtarget-specific unrolling preferences.
+ switch (ST->getProcFamily()) {
+ case AArch64Subtarget::AppleA14:
+ case AArch64Subtarget::AppleA15:
+ case AArch64Subtarget::AppleA16:
+ case AArch64Subtarget::AppleM4:
+ getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+ break;
+ case AArch64Subtarget::Falkor:
+ if (EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
+ break;
+ default:
+ break;
+ }
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index deacec795fb03a..1a091e847ca345 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
; APPLE-LABEL: define void @small_load_store_loop(
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
+; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE: [[ENTRY_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
-; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT: store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT: store float [[L_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT: [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
+; APPLE-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; APPLE-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT: store float [[L_2]], ptr [[GEP_DST_2]], align 4
+; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
+; APPLE-NEXT: [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
+; APPLE-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
+; APPLE-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT: store float [[L_3]], ptr [[GEP_DST_3]], align 4
+; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
+; APPLE-NEXT: [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
+; APPLE-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
+; APPLE-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT: store float [[L_4]], ptr [[GEP_DST_4]], align 4
+; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
+; APPLE-NEXT: [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
+; APPLE-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
+; APPLE-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT: store float [[L_5]], ptr [[GEP_DST_5]], align 4
+; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
+; APPLE-NEXT: [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
+; APPLE-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
+; APPLE-NEXT: [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT: store float [[L_6]], ptr [[GEP_DST_6]], align 4
+; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
+; APPLE-NEXT: [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
+; APPLE-NEXT: [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
+; APPLE-NEXT: [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT: store float [[L_7]], ptr [[GEP_DST_7]], align 4
+; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; APPLE: [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE: [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
+; APPLE: [[LOOP_EPIL]]:
+; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT: [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
+; APPLE-NEXT: [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
+; APPLE-NEXT: [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
+; APPLE-NEXT: store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
+; APPLE-NEXT: [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
+; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
@@ -99,13 +173,21 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
; APPLE-LABEL: define void @early_continue_dep_on_load_large(
; APPLE-SAME: ptr [[P_1:%.*]], ptr [[P_2:%.*]], i64 [[N:%.*]], i32 [[X:%.*]], i32 [[WIDTH:%.*]], i32 [[T_1:%.*]], i32 [[T_2:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 3
+; APPLE-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 3
+; APPLE-NEXT: br i1 [[TMP2]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE: [[ENTRY_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TMP0]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP_HEADER:.*]]
; APPLE: [[LOOP_HEADER]]:
-; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_LATCH:.*]] ]
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP_LATCH_3]] ]
; APPLE-NEXT: [[GEP_EPIL:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_EPIL]]
; APPLE-NEXT: [[L_1_EPIL:%.*]] = load i32, ptr [[GEP_EPIL]], align 4
; APPLE-NEXT: [[CMP6_NOT_EPIL:%.*]] = icmp sgt i32 [[L_1_EPIL]], [[T_1]]
-; APPLE-NEXT: br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; APPLE-NEXT: br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH:.*]]
; APPLE: [[THEN]]:
; APPLE-NEXT: [[GEP_4_EPIL:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_EPIL]], i64 4
; APPLE-NEXT: [[L_2_EPIL:%.*]] = load i8, ptr [[GEP_4_EPIL]], align 4
@@ -150,9 +232,224 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
; APPLE-NEXT: store i8 [[RES_EPIL]], ptr [[GEP_5_EPIL]], align 1
; APPLE-NEXT: br label %[[LOOP_LATCH]]
; APPLE: [[LOOP_LATCH]]:
-; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[GEP_1:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT: [[L_1_1:%.*]] = load i32, ptr [[GEP_1]], align 4
+; APPLE-NEXT: [[C_1_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_1]]
+; APPLE-NEXT: br i1 [[C_1_1]], label %[[THEN_1:.*]], label %[[LOOP_LATCH_1:.*]]
+; APPLE: [[THEN_1]]:
+; APPLE-NEXT: [[GEP_4_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 4
+; APPLE-NEXT: [[L_2_1:%.*]] = load i8, ptr [[GEP_4_1]], align 4
+; APPLE-NEXT: [[C_2_1:%.*]] = icmp ugt i8 [[L_2_1]], 7
+; APPLE-NEXT: br i1 [[C_2_1]], label %[[MERGE_11:.*]], label %[[ELSE_1:.*]]
+; APPLE: [[ELSE_1]]:
+; APPLE-NEXT: [[CONV_I_1:%.*]] = zext nneg i8 [[L_2_1]] to i64
+; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_1]]
+; APPLE-NEXT: [[L_3_1:%.*]] = load i8, ptr [[GEP_A_1]], align 1
+; APPLE-NEXT: [[IDXPROM_I_1:%.*]] = sext i8 [[L_3_1]] to i64
+; APPLE-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT: [[L_4_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
+; APPLE-NEXT: [[GEP_C_1:%.*]] = getelementptr inbounds [8 x i32], ptr @C, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT: [[L_5_1:%.*]] = load i32, ptr [[GEP_C_1]], align 4
+; APPLE-NEXT: br label %[[MERGE_11]]
+; APPLE: [[MERGE_11]]:
+; APPLE-NEXT: [[MERGE_1_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_4_1]], %[[ELSE_1]] ]
+; APPLE-NEXT: [[MERGE_2_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_5_1]], %[[ELSE_1]] ]
+; APPLE-NEXT: [[ADD14_1:%.*]] = add nsw i32 [[MERGE_2_1]], [[X]]
+; APPLE-NEXT: [[MUL15_1:%.*]] = mul nsw i32 [[ADD14_1]], [[WIDTH]]
+; APPLE-NEXT: [[TMP4:%.*]] = trunc nuw nsw i64 [[IV_NEXT_EPIL]] to i32
+; APPLE-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[MERGE_1_1]], [[TMP4]]
+; APPLE-NEXT: [[ADD17_1:%.*]] = add nsw i32 [[ADD16_1]], [[MUL15_1]]
+; APPLE-NEXT: [[IDXPROM18_1:%.*]] = sext i32 [[ADD17_1]] to i64
+; APPLE-NEXT: [[GEP_P_2_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM18_1]]
+; APPLE-NEXT: [[L_6_1:%.*]] = load i32, ptr [[GEP_P_2_1]], align 4
+; APPLE-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[X]], [[MERGE_2_1]]
+; APPLE-NEXT: [[MUL21_1:%.*]] = mul nsw i32 [[SUB_1]], [[WIDTH]]
+; APPLE-NEXT: [[SUB22_1:%.*]] = sub i32 [[TMP4]], [[MERGE_1_1]]
+; APPLE-NEXT: [[ADD23_1:%.*]] = add nsw i32 [[SUB22_1]], [[MUL21_1]]
+; APPLE-NEXT: [[IDXPROM24_1:%.*]] = sext i32 [[ADD23_1]] to i64
+; APPLE-NEXT: [[GEP_P2_1_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM24_1]]
+; APPLE-NEXT: [[L_7_1:%.*]] = load i32, ptr [[GEP_P2_1_1]], align 4
+; APPLE-NEXT: [[C_3_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_6_1]]
+; APPLE-NEXT: [[C_4_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_7_1]]
+; APPLE-NEXT: [[AND34_1:%.*]] = and i1 [[C_3_1]], [[C_4_1]]
+; APPLE-NEXT: br i1 [[AND34_1]], label %[[STORE_RES_1:.*]], label %[[LOOP_LATCH_1]]
+; APPLE: [[STORE_RES_1]]:
+; APPLE-NEXT: [[C_5_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_2]]
+; APPLE-NEXT: [[GEP_5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 5
+; APPLE-NEXT: [[RES_1:%.*]] = select i1 [[C_5_1]], i8 1, i8 2
+; APPLE-NEXT: store i8 [[RES_1]], ptr [[GEP_5_1]], align 1
+; APPLE-NEXT: br label %[[LOOP_LATCH_1]]
+; APPLE: [[LOOP_LATCH_1]]:
+; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT: [[GEP_2:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT: [[L_1_2:%.*]] = load i32, ptr [[GEP_2]], align 4
+; APPLE-NEXT: [[C_1_2:%.*]] = icmp sgt i32 [[L_1_2]], [[T_1]]
+; APPLE-NEXT: br i1 [[C_1_2]], label %[[THEN_2:.*]], label %[[LOOP_LATCH_2:.*]]
+; APPLE: [[THEN_2]]:
+; APPLE-NEXT: [[GEP_4_2:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_2]], i64 4
+; APPLE-NEXT: [[L_2_2:%.*]] = load i8, ptr [[GEP_4_2]], align 4
+; APPLE-NEXT: [[C_2_2:%.*]] = icmp ugt i8 [[L_2_2]], 7
+; APPLE-NEXT: br i1 [[C_2_2]], label %[[MERGE_22:.*]], label %[[ELSE_2:.*]]
+; APPLE: [[ELSE_2]]:
+; APPLE-NEXT: [[CONV_I_2:%.*]] = zext nneg i8 [[L_2_2]] to i64
+; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_2]]
+; APPLE-NEXT: [[L_3_2:%.*]] = load i8, ptr [[GEP_A_2]], align 1
+; APPLE-NEXT: [[IDXPROM_I_2:%.*]] = sext i8 [[L_3_2]] to i64
+; APPLE-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_2]]
+; ...
[truncated]
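As a side note on the single-block heuristic in the patch above, the snippet below is a standalone sketch of the unroll-count search, with the 8/48/16 constants copied from the patch and a hypothetical per-iteration Size of 6. It picks the largest count whose unrolled body stays within 48 instructions and whose size either is a multiple of 16 or leaves the fullest partial 16-instruction fetch line.

```cpp
#include <cstdio>

// Standalone sketch of the unroll-count search in
// getAppleRuntimeUnrollPreferences (constants mirror the patch; Size is a
// made-up per-iteration code-size estimate).
int main() {
  const unsigned MaxInstsPerLine = 16;
  const unsigned Size = 6; // hypothetical loop-body size in instructions
  unsigned BestUC = 1;
  unsigned SizeWithBestUC = BestUC * Size;
  for (unsigned UC = 1; UC <= 8; ++UC) {
    unsigned SizeWithUC = UC * Size;
    if (SizeWithUC > 48) // never exceed 48 instructions in the unrolled body
      break;
    // Prefer a size that is a multiple of the fetch-line width, or that
    // fills the last fetch line more than the best candidate so far.
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = BestUC * Size;
    }
  }
  std::printf("BestUC = %u (unrolled body size %u)\n", BestUC, SizeWithBestUC);
  return 0;
}
```

With Size = 6 this settles on BestUC = 8 (48 instructions), matching the factor of 8 visible in the small_load_store_loop test above, though the cost model's actual Size estimate for that loop may differ.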
@llvm/pr-subscribers-backend-aarch64 posted the same description and patch excerpt as above.
Force-pushed from 804196c to c2ab633.

Rebased after landing #118317. Ping :)
  };
  CmpInst::Predicate Pred;
  Instruction *I;
  if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
What about a DependsOnLoopLoad(I, 0) via the other ICmp operand, would that be canonicalized away, or should we care about that here?
For now I think most cases should be caught due to canonicalization. I'll look into extending it as a follow-up.
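A minimal sketch of what such an extension might look like (not part of this patch; it reuses the Term, DependsOnLoopLoad, and UP names from the snippet above and simply checks both compare operands):

```cpp
// Hypothetical follow-up: accept the branch if either ICmp operand
// depends on a loop-varying load.
CmpInst::Predicate Pred;
Value *LHS, *RHS;
if (match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), m_Value(),
                     m_Value()))) {
  auto *LI = dyn_cast<Instruction>(LHS);
  auto *RI = dyn_cast<Instruction>(RHS);
  if ((LI && DependsOnLoopLoad(LI, 0)) || (RI && DependsOnLoopLoad(RI, 0)))
    UP.Runtime = true;
}
```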
  if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
    return false;

  if (auto *LI = dyn_cast<LoadInst>(I))
Use isa<LoadInst> here, since the result isn't used.
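A sketch of that suggestion applied (hypothetical; not necessarily the exact committed change):

```cpp
// The cast result is unused, so isa<> is sufficient here.
if (isa<LoadInst>(I))
  return true;
```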
couple of nits/questions, but otherwise LGTM
Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops

Force-pushed from c2ab633 to 2fab33a.
[AArch64] Unroll some loops with early-continues on Apple Silicon. (llvm#118499): Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops. Builds on top of llvm#118317. PR: llvm#118499. (cherry picked from commit d486b76)
[AArch64] Unroll some loops with early-continues on Apple Silicon. (#118499): Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops. Builds on top of llvm/llvm-project#118317. PR: llvm/llvm-project#118499.
Local branch amd-gfx 577c5c7 Merged main:1b9805c14dba into amd-gfx:0cab57531d07 Remote branch main d486b76 [AArch64] Unroll some loops with early-continues on Apple Silicon. (llvm#118499)
Try to runtime-unroll loops with early-continues depending on loop-varying loads; this helps with branch-prediction for the early-continues and can significantly improve performance for such loops.

Builds on top of #118317.
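For context, a minimal sketch (not taken from the PR) of the kind of loop this heuristic targets: the continue condition depends on a value loaded each iteration, so runtime unrolling exposes several independent early-continue branches per unrolled iteration to the branch predictor.

```cpp
// Hypothetical C++ example: the early-continue is guarded by a loop-varying load.
void sketch(const int *vals, int *out, long n, int threshold) {
  for (long i = 0; i < n; ++i) {
    int v = vals[i];      // loop-varying load
    if (v <= threshold)   // early-continue depends on the loaded value
      continue;
    out[i] = v * 2;       // work done only for the remaining elements
  }
}
```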