[AArch64] Unroll some loops with early-continues on Apple Silicon. #118499

Merged: 3 commits merged into llvm:main on Dec 22, 2024

Conversation

@fhahn (Contributor) commented Dec 3, 2024

Try to runtime-unroll loops with early-continues depending on loop-varying
loads; this helps with branch-prediction for the early-continues and can
significantly improve performance for such loops

Builds on top of #118317
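
For illustration, a hypothetical C++ example (the names and constants below are mine, not from the patch) of the kind of loop this heuristic targets: an inner loop whose early continue is guarded by a comparison on a value loaded in the same iteration, so the branch outcome is loop-varying and runtime unrolling gives the predictor several independent copies of the compare per trip.

    // Sketch of a target loop (hypothetical example, not taken from the patch).
    // The continue condition depends on a value loaded each iteration, so it is
    // loop-varying and hard to predict; runtime unrolling by a small factor gives
    // the branch predictor several independent instances of the compare per trip.
    void update(const int *vals, unsigned char *flags, long n, int threshold) {
      for (long i = 0; i < n; ++i) {
        int v = vals[i];               // loop-varying load
        if (v <= threshold)            // early continue guarded by the loaded value
          continue;
        flags[i] = v > 2 * threshold ? 1 : 2; // work only for surviving iterations
      }
    }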

@llvmbot (Member) commented Dec 3, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Try to runtime-unroll loops with early-continues depending on loop-varying
loads; this helps with branch-prediction for the early-continues and can
significantly improve performance for such loops

Builds on top of #118317


Patch is 31.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118499.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+142-3)
  • (modified) llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll (+311-9)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd52..7e7b18ffcc15b7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3989,6 +3989,133 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+                                 TargetTransformInfo::UnrollingPreferences &UP,
+                                 AArch64TTIImpl &TTI) {
+  // Limit loops with structure that is highly likely to benefit from runtime
+  // unrolling; that is we exclude outer loops, loops with multiple exits and
+  // many blocks (i.e. likely with complex control flow). Note that the
+  // heuristics here may be overly conservative and we err on the side of
+  // avoiding runtime unrolling rather than unroll excessively. They are all
+  // subject to further refinement.
+  if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+    return;
+
+  const SCEV *BTC = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+      (SE.getSmallConstantMaxTripCount(L) > 0 &&
+       SE.getSmallConstantMaxTripCount(L) <= 32))
+    return;
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return;
+
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+        return;
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Size +=
+          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+    }
+  }
+
+  // Limit to loops with trip counts that are cheap to expand.
+  UP.SCEVExpansionBudget = 1;
+
+  // Try to unroll small, single block loops, if they have load/store
+  // dependencies, to expose more parallel memory access streams.
+  BasicBlock *Header = L->getHeader();
+  if (Header == L->getLoopLatch()) {
+    if (Size > 8)
+      return;
+
+    SmallPtrSet<Value *, 8> LoadedValues;
+    SmallVector<StoreInst *> Stores;
+    for (auto *BB : L->blocks()) {
+      for (auto &I : *BB) {
+        Value *Ptr = getLoadStorePointerOperand(&I);
+        if (!Ptr)
+          continue;
+        const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+        if (SE.isLoopInvariant(PtrSCEV, L))
+          continue;
+        if (isa<LoadInst>(&I))
+          LoadedValues.insert(&I);
+        else
+          Stores.push_back(cast<StoreInst>(&I));
+      }
+    }
+
+    // Try to find an unroll count that maximizes the use of the instruction
+    // window, i.e. trying to fetch as many instructions per cycle as possible.
+    unsigned MaxInstsPerLine = 16;
+    unsigned UC = 1;
+    unsigned BestUC = 1;
+    unsigned SizeWithBestUC = BestUC * Size;
+    while (UC <= 8) {
+      unsigned SizeWithUC = UC * Size;
+      if (SizeWithUC > 48)
+        break;
+      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+        BestUC = UC;
+        SizeWithBestUC = BestUC * Size;
+      }
+      UC++;
+    }
+
+    if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+          return LoadedValues.contains(SI->getOperand(0));
+        }))
+      return;
+
+    UP.Runtime = true;
+    UP.DefaultUnrollRuntimeCount = BestUC;
+    return;
+  }
+
+  // Try to runtime-unroll loops with early-continues depending on loop-varying
+  // loads; this helps with branch-prediction for the early-continues.
+  auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
+  auto *Latch = L->getLoopLatch();
+  SmallVector<BasicBlock *> Preds(predecessors(Latch));
+  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
+      none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
+      none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
+    return;
+
+  std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
+      [&](Instruction *I, unsigned Depth) -> bool {
+    if (isa<PHINode>(I))
+      return false;
+
+    if (L->isLoopInvariant(I))
+      return false;
+
+    if (Depth > 8)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(I))
+      return true;
+
+    return any_of(I->operands(), [&](Value *V) {
+      auto *I = dyn_cast<Instruction>(V);
+      return I && DependsOnLoopLoad(I, Depth + 1);
+    });
+  };
+  CmpInst::Predicate Pred;
+  Instruction *I;
+  if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
+                       m_Value())) &&
+      DependsOnLoopLoad(I, 0)) {
+    UP.Runtime = true;
+  }
+}
+
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP,
                                              OptimizationRemarkEmitter *ORE) {
@@ -4006,9 +4133,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
-  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
-      EnableFalkorHWPFUnrollFix)
-    getFalkorUnrollingPreferences(L, SE, UP);
+  // Apply subtarget-specific unrolling preferences.
+  switch (ST->getProcFamily()) {
+  case AArch64Subtarget::AppleA14:
+  case AArch64Subtarget::AppleA15:
+  case AArch64Subtarget::AppleA16:
+  case AArch64Subtarget::AppleM4:
+    getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+    break;
+  case AArch64Subtarget::Falkor:
+    if (EnableFalkorHWPFUnrollFix)
+      getFalkorUnrollingPreferences(L, SE, UP);
+    break;
+  default:
+    break;
+  }
 
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index deacec795fb03a..1a091e847ca345 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
 ; APPLE-LABEL: define void @small_load_store_loop(
 ; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 7
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[LOOP:.*]]
 ; APPLE:       [[LOOP]]:
-; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
 ; APPLE-NEXT:    [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
 ; APPLE-NEXT:    [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
 ; APPLE-NEXT:    [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
 ; APPLE-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
 ; APPLE-NEXT:    store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT:    br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT:    store float [[L_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT:    [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
+; APPLE-NEXT:    [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; APPLE-NEXT:    [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store float [[L_2]], ptr [[GEP_DST_2]], align 4
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
+; APPLE-NEXT:    [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
+; APPLE-NEXT:    [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
+; APPLE-NEXT:    [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store float [[L_3]], ptr [[GEP_DST_3]], align 4
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
+; APPLE-NEXT:    [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
+; APPLE-NEXT:    [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
+; APPLE-NEXT:    [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store float [[L_4]], ptr [[GEP_DST_4]], align 4
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
+; APPLE-NEXT:    [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
+; APPLE-NEXT:    [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
+; APPLE-NEXT:    [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store float [[L_5]], ptr [[GEP_DST_5]], align 4
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
+; APPLE-NEXT:    [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
+; APPLE-NEXT:    [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
+; APPLE-NEXT:    [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store float [[L_6]], ptr [[GEP_DST_6]], align 4
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
+; APPLE-NEXT:    [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
+; APPLE-NEXT:    [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
+; APPLE-NEXT:    [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store float [[L_7]], ptr [[GEP_DST_7]], align 4
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[LOOP_EPIL:.*]]
+; APPLE:       [[LOOP_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
+; APPLE-NEXT:    [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
+; APPLE-NEXT:    [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
+; APPLE-NEXT:    store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
+; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
@@ -99,13 +173,21 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
 ; APPLE-LABEL: define void @early_continue_dep_on_load_large(
 ; APPLE-SAME: ptr [[P_1:%.*]], ptr [[P_2:%.*]], i64 [[N:%.*]], i32 [[X:%.*]], i32 [[WIDTH:%.*]], i32 [[T_1:%.*]], i32 [[T_2:%.*]]) #[[ATTR0]] {
 ; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT:    [[TMP1:%.*]] = add i64 [[N]], -2
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP0]], 3
+; APPLE-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 3
+; APPLE-NEXT:    br i1 [[TMP2]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[TMP0]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; APPLE:       [[LOOP_HEADER]]:
-; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_LATCH:.*]] ]
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP_LATCH_3]] ]
 ; APPLE-NEXT:    [[GEP_EPIL:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_EPIL]]
 ; APPLE-NEXT:    [[L_1_EPIL:%.*]] = load i32, ptr [[GEP_EPIL]], align 4
 ; APPLE-NEXT:    [[CMP6_NOT_EPIL:%.*]] = icmp sgt i32 [[L_1_EPIL]], [[T_1]]
-; APPLE-NEXT:    br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; APPLE-NEXT:    br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH:.*]]
 ; APPLE:       [[THEN]]:
 ; APPLE-NEXT:    [[GEP_4_EPIL:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_EPIL]], i64 4
 ; APPLE-NEXT:    [[L_2_EPIL:%.*]] = load i8, ptr [[GEP_4_EPIL]], align 4
@@ -150,9 +232,224 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
 ; APPLE-NEXT:    store i8 [[RES_EPIL]], ptr [[GEP_5_EPIL]], align 1
 ; APPLE-NEXT:    br label %[[LOOP_LATCH]]
 ; APPLE:       [[LOOP_LATCH]]:
-; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT:    br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; APPLE-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[GEP_1:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT:    [[L_1_1:%.*]] = load i32, ptr [[GEP_1]], align 4
+; APPLE-NEXT:    [[C_1_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_1]]
+; APPLE-NEXT:    br i1 [[C_1_1]], label %[[THEN_1:.*]], label %[[LOOP_LATCH_1:.*]]
+; APPLE:       [[THEN_1]]:
+; APPLE-NEXT:    [[GEP_4_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 4
+; APPLE-NEXT:    [[L_2_1:%.*]] = load i8, ptr [[GEP_4_1]], align 4
+; APPLE-NEXT:    [[C_2_1:%.*]] = icmp ugt i8 [[L_2_1]], 7
+; APPLE-NEXT:    br i1 [[C_2_1]], label %[[MERGE_11:.*]], label %[[ELSE_1:.*]]
+; APPLE:       [[ELSE_1]]:
+; APPLE-NEXT:    [[CONV_I_1:%.*]] = zext nneg i8 [[L_2_1]] to i64
+; APPLE-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_1]]
+; APPLE-NEXT:    [[L_3_1:%.*]] = load i8, ptr [[GEP_A_1]], align 1
+; APPLE-NEXT:    [[IDXPROM_I_1:%.*]] = sext i8 [[L_3_1]] to i64
+; APPLE-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT:    [[L_4_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
+; APPLE-NEXT:    [[GEP_C_1:%.*]] = getelementptr inbounds [8 x i32], ptr @C, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT:    [[L_5_1:%.*]] = load i32, ptr [[GEP_C_1]], align 4
+; APPLE-NEXT:    br label %[[MERGE_11]]
+; APPLE:       [[MERGE_11]]:
+; APPLE-NEXT:    [[MERGE_1_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_4_1]], %[[ELSE_1]] ]
+; APPLE-NEXT:    [[MERGE_2_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_5_1]], %[[ELSE_1]] ]
+; APPLE-NEXT:    [[ADD14_1:%.*]] = add nsw i32 [[MERGE_2_1]], [[X]]
+; APPLE-NEXT:    [[MUL15_1:%.*]] = mul nsw i32 [[ADD14_1]], [[WIDTH]]
+; APPLE-NEXT:    [[TMP4:%.*]] = trunc nuw nsw i64 [[IV_NEXT_EPIL]] to i32
+; APPLE-NEXT:    [[ADD16_1:%.*]] = add nsw i32 [[MERGE_1_1]], [[TMP4]]
+; APPLE-NEXT:    [[ADD17_1:%.*]] = add nsw i32 [[ADD16_1]], [[MUL15_1]]
+; APPLE-NEXT:    [[IDXPROM18_1:%.*]] = sext i32 [[ADD17_1]] to i64
+; APPLE-NEXT:    [[GEP_P_2_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM18_1]]
+; APPLE-NEXT:    [[L_6_1:%.*]] = load i32, ptr [[GEP_P_2_1]], align 4
+; APPLE-NEXT:    [[SUB_1:%.*]] = sub nsw i32 [[X]], [[MERGE_2_1]]
+; APPLE-NEXT:    [[MUL21_1:%.*]] = mul nsw i32 [[SUB_1]], [[WIDTH]]
+; APPLE-NEXT:    [[SUB22_1:%.*]] = sub i32 [[TMP4]], [[MERGE_1_1]]
+; APPLE-NEXT:    [[ADD23_1:%.*]] = add nsw i32 [[SUB22_1]], [[MUL21_1]]
+; APPLE-NEXT:    [[IDXPROM24_1:%.*]] = sext i32 [[ADD23_1]] to i64
+; APPLE-NEXT:    [[GEP_P2_1_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM24_1]]
+; APPLE-NEXT:    [[L_7_1:%.*]] = load i32, ptr [[GEP_P2_1_1]], align 4
+; APPLE-NEXT:    [[C_3_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_6_1]]
+; APPLE-NEXT:    [[C_4_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_7_1]]
+; APPLE-NEXT:    [[AND34_1:%.*]] = and i1 [[C_3_1]], [[C_4_1]]
+; APPLE-NEXT:    br i1 [[AND34_1]], label %[[STORE_RES_1:.*]], label %[[LOOP_LATCH_1]]
+; APPLE:       [[STORE_RES_1]]:
+; APPLE-NEXT:    [[C_5_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_2]]
+; APPLE-NEXT:    [[GEP_5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 5
+; APPLE-NEXT:    [[RES_1:%.*]] = select i1 [[C_5_1]], i8 1, i8 2
+; APPLE-NEXT:    store i8 [[RES_1]], ptr [[GEP_5_1]], align 1
+; APPLE-NEXT:    br label %[[LOOP_LATCH_1]]
+; APPLE:       [[LOOP_LATCH_1]]:
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT:    [[GEP_2:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    [[L_1_2:%.*]] = load i32, ptr [[GEP_2]], align 4
+; APPLE-NEXT:    [[C_1_2:%.*]] = icmp sgt i32 [[L_1_2]], [[T_1]]
+; APPLE-NEXT:    br i1 [[C_1_2]], label %[[THEN_2:.*]], label %[[LOOP_LATCH_2:.*]]
+; APPLE:       [[THEN_2]]:
+; APPLE-NEXT:    [[GEP_4_2:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_2]], i64 4
+; APPLE-NEXT:    [[L_2_2:%.*]] = load i8, ptr [[GEP_4_2]], align 4
+; APPLE-NEXT:    [[C_2_2:%.*]] = icmp ugt i8 [[L_2_2]], 7
+; APPLE-NEXT:    br i1 [[C_2_2]], label %[[MERGE_22:.*]], label %[[ELSE_2:.*]]
+; APPLE:       [[ELSE_2]]:
+; APPLE-NEXT:    [[CONV_I_2:%.*]] = zext nneg i8 [[L_2_2]] to i64
+; APPLE-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_2]]
+; APPLE-NEXT:    [[L_3_2:%.*]] = load i8, ptr [[GEP_A_2]], align 1
+; APPLE-NEXT:    [[IDXPROM_I_2:%.*]] = sext i8 [[L_3_2]] to i64
+; APPLE-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_2]]
+; ...
[truncated]
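
To make the unroll-count search in getAppleRuntimeUnrollPreferences easier to follow outside the diff, here is a standalone C++ sketch of the same selection logic. Assumptions: the constants 16, 48 and 8 mirror the patch, the body size is passed as a plain integer instead of being accumulated via TTI cost queries, and the function/driver names are mine.

    // Standalone sketch of the BestUC search from the patch: pick an unroll count
    // (up to 8) whose total body size stays within 48 instructions and comes as
    // close as possible to filling 16-instruction fetch lines.
    #include <cstdio>

    static unsigned pickUnrollCount(unsigned Size) {
      const unsigned MaxInstsPerLine = 16;
      unsigned BestUC = 1;
      unsigned SizeWithBestUC = BestUC * Size;
      for (unsigned UC = 1; UC <= 8; ++UC) {
        unsigned SizeWithUC = UC * Size;
        if (SizeWithUC > 48)
          break;
        if ((SizeWithUC % MaxInstsPerLine) == 0 ||
            (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
          BestUC = UC;
          SizeWithBestUC = BestUC * Size;
        }
      }
      return BestUC;
    }

    int main() {
      // For example, a 6-instruction body ends up unrolled 8x and an 8-instruction
      // body 6x -- both 48 instructions, i.e. three full 16-instruction lines.
      for (unsigned Size = 2; Size <= 8; ++Size)
        std::printf("Size %u -> unroll count %u\n", Size, pickUnrollCount(Size));
      return 0;
    }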

@llvmbot (Member) commented Dec 3, 2024

@llvm/pr-subscribers-backend-aarch64


@fhahn fhahn force-pushed the rt-unroll-early-continue-load branch 2 times, most recently from 804196c to c2ab633 on December 12, 2024 13:59
@fhahn (Contributor, Author) commented Dec 12, 2024

Rebased after landing #118317.

Ping :)

};
CmpInst::Predicate Pred;
Instruction *I;
if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
Contributor:
what about a DependsOnLoopLoad(I, 0) via the other ICmp operand, would that be canonicalized away, or should we care about that here?

Contributor Author (fhahn):
For now I think most cases should be caught due to canonicalization. I'll look into extending it as a follow-up.
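
For context, a sketch (hypothetical C++ example, not from the PR) of the commuted form being discussed. The claim about operand ordering is hedged: when the threshold is a plain argument, instcombine's complexity-based canonicalization usually swaps the compare so the loaded value ends up as the first ICmp operand, which is the only form the m_ICmp(m_Instruction(I), m_Value()) pattern above inspects; if the threshold is itself an instruction, no swap is guaranteed, which is the "most cases" caveat.

    // Hypothetical example of the commuted compare in question. Written this way,
    // the loaded value starts out as the second icmp operand; when t is a plain
    // function argument, instcombine's complexity-based operand ordering usually
    // swaps the operands so the load comes first and DependsOnLoopLoad(I, 0)
    // still fires on it.
    void f(const int *a, unsigned char *out, long n, int t) {
      for (long i = 0; i < n; ++i) {
        int v = a[i];
        if (t > v)   // same condition as `v < t`; canonicalization typically swaps it
          continue;
        out[i] = 1;
      }
    }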

    if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
      return false;

    if (auto *LI = dyn_cast<LoadInst>(I))
Contributor:
isa, since the result isn't used.
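
In other words, a sketch of the suggested form (only the cast changes; the bound variable is dropped because it is never used):

    if (isa<LoadInst>(I))
      return true;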

@jroelofs (Contributor) left a comment:
couple of nits/questions, but otherwise LGTM

Try to runtime-unroll loops with early-continues depending on loop-varying
loads; this helps with branch-prediction for the early-continues and can
significantly improve performance for such loops
@fhahn fhahn force-pushed the rt-unroll-early-continue-load branch from c2ab633 to 2fab33a on December 21, 2024 18:32
@fhahn fhahn merged commit d486b76 into llvm:main Dec 22, 2024
8 checks passed
@fhahn fhahn deleted the rt-unroll-early-continue-load branch December 22, 2024 13:11
fhahn added a commit to fhahn/llvm-project that referenced this pull request Jan 6, 2025
…lvm#118499)

Try to runtime-unroll loops with early-continues depending on
loop-varying loads; this helps with branch-prediction for the
early-continues and can significantly improve performance
for such loops

Builds on top of llvm#118317.

PR: llvm#118499.
(cherry picked from commit d486b76)
github-actions bot pushed a commit to arm/arm-toolchain that referenced this pull request Jan 10, 2025
…Silicon. (#118499)

Try to runtime-unroll loops with early-continues depending on
loop-varying loads; this helps with branch-prediction for the
early-continues and can significantly improve performance
for such loops

Builds on top of llvm/llvm-project#118317.

PR: llvm/llvm-project#118499.
qiaojbao pushed a commit to GPUOpen-Drivers/llvm-project that referenced this pull request Feb 7, 2025
Local branch amd-gfx 577c5c7 Merged main:1b9805c14dba into amd-gfx:0cab57531d07
Remote branch main d486b76 [AArch64] Unroll some loops with early-continues on Apple Silicon. (llvm#118499)