diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e0..c6e09c4f2e6ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -772,6 +772,105 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
   return true;
 }
 
+/// Try to hoist \p Previous and its operands before all users of \p FOR.
+static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
+                                        VPRecipeBase *Previous,
+                                        VPDominatorTree &VPDT) {
+  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
+    return false;
+
+  // Collect recipes that need hoisting.
+  SmallVector<VPRecipeBase *> HoistCandidates;
+  SmallPtrSet<VPRecipeBase *, 8> Visited;
+  VPRecipeBase *HoistPoint = nullptr;
+  // Find the closest hoist point by looking at all users of FOR and selecting
+  // the recipe dominating all other users.
+  for (VPUser *U : FOR->users()) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    if (!R)
+      continue;
+    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
+      HoistPoint = R;
+  }
+  assert(all_of(FOR->users(),
+                [&VPDT, HoistPoint](VPUser *U) {
+                  auto *R = dyn_cast<VPRecipeBase>(U);
+                  return !R || HoistPoint == R ||
+                         VPDT.properlyDominates(HoistPoint, R);
+                }) &&
+         "HoistPoint must dominate all users of FOR");
+
+  auto NeedsHoisting = [HoistPoint, &VPDT,
+                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
+    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
+    if (!HoistCandidate)
+      return nullptr;
+    VPRegionBlock *EnclosingLoopRegion =
+        HoistCandidate->getParent()->getEnclosingLoopRegion();
+    assert((!HoistCandidate->getParent()->getParent() ||
+            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+           "CFG in VPlan should still be flat, without replicate regions");
+    // Hoist candidate was already visited, no need to hoist.
+    if (!Visited.insert(HoistCandidate).second)
+      return nullptr;
+
+    // Candidate is outside loop region or a header phi, dominates FOR users w/o
+    // hoisting.
+    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
+      return nullptr;
+
+    // If we reached a recipe that dominates HoistPoint, we don't need to
+    // hoist the recipe.
+    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
+      return nullptr;
+    return HoistCandidate;
+  };
+  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
+    // Avoid hoisting candidates with side-effects, as we do not yet analyze
+    // associated dependencies.
+    return !HoistCandidate->mayHaveSideEffects();
+  };
+
+  if (!NeedsHoisting(Previous->getVPSingleValue()))
+    return true;
+
+  // Recursively try to hoist Previous and its operands before all users of FOR.
+  HoistCandidates.push_back(Previous);
+
+  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
+    VPRecipeBase *Current = HoistCandidates[I];
+    assert(Current->getNumDefinedValues() == 1 &&
+           "only recipes with a single defined value expected");
+    if (!CanHoist(Current))
+      return false;
+
+    for (VPValue *Op : Current->operands()) {
+      // If we reach FOR, it means the original Previous depends on some other
+      // recurrence that in turn depends on FOR. If that is the case, we would
+      // also need to hoist recipes involving the other FOR, which may break
+      // dependencies.
+      if (Op == FOR)
+        return false;
+
+      if (auto *R = NeedsHoisting(Op))
+        HoistCandidates.push_back(R);
+    }
+  }
+
+  // Order recipes to hoist by dominance so earlier instructions are processed
+  // first.
+  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
+    return VPDT.properlyDominates(A, B);
+  });
+
+  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
+    HoistCandidate->moveBefore(*HoistPoint->getParent(),
+                               HoistPoint->getIterator());
+  }
+
+  return true;
+}
+
 bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                   VPBuilder &LoopBuilder) {
   VPDominatorTree VPDT;
@@ -795,7 +894,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
       Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
     }
 
-    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
+    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
+        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
       return false;
 
     // Introduce a recipe to combine the incoming and previous values of a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6..11e094db6294f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -36,11 +36,11 @@ struct VPlanTransforms {
                              GetIntOrFpInductionDescriptor,
                          ScalarEvolution &SE, const TargetLibraryInfo &TLI);
 
-  /// Sink users of fixed-order recurrences after the recipe defining their
-  /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions
-  /// to combine the value from the recurrence phis and previous values. The
-  /// current implementation assumes all users can be sunk after the previous
-  /// value, which is enforced by earlier legality checks.
+  /// Try to have all users of fixed-order recurrences appear after the recipe
+  /// defining their previous value, by either sinking users or hoisting recipes
+  /// defining their previous value (and its operands). Then introduce
+  /// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
+  /// recurrence phis and previous values.
   /// \returns true if all users of fixed-order recurrences could be re-arranged
   /// as needed or false if it is not possible. In the latter case, \p Plan is
   /// not valid.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index dcdc723b4b9b9..ca38d27477600 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -278,3 +278,79 @@ exit:
   store double %.lcssa, ptr %C
   ret i64 %.in.lcssa
 }
+
+; Test for https://github.com/llvm/llvm-project/issues/106523.
+; %for.2 requires no code motion, as its previous (%or) precedes its (first)
+; user (store). Furthermore, its user cannot sink, being a store.
+;
+; %for.1 requires code motion, as its previous (%trunc) follows its (first)
+; user (%or). Sinking %or past %trunc seems possible, as %or has no uses
+; (except for feeding %for.2; worth strengthening VPlan's dce?). However, %or
+; is both the user of %for.1 and the previous of %for.2, and we refrain from
+; sinking instructions that act as previous because they (may) serve as points
+; to sink after.
+;
+; Instead, %for.1 can be reconciled by hoisting its previous above its user
+; %or, as this previous %trunc depends only on %iv.
+define void @for_iv_trunc_optimized(ptr %dst) {
+; CHECK-LABEL: @for_iv_trunc_optimized(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1, i32 2, i32 3, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP0]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3]] = or <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; CHECK-NEXT:    store i32 [[TMP6]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 336
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OR]] = or i32 [[FOR_1]], 3
+; CHECK-NEXT:    [[ADD]] = add i64 [[IV]], 1
+; CHECK-NEXT:    store i32 [[FOR_2]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
+; CHECK-NEXT:    [[TRUNC]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+bb:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %bb ], [ %add, %loop ]
+  %for.1 = phi i32 [ 1, %bb ], [ %trunc, %loop ]
+  %for.2 = phi i32 [ 0, %bb ], [ %or, %loop ]
+  %or = or i32 %for.1, 3
+  %add = add i64 %iv, 1
+  store i32 %for.2, ptr %dst, align 4
+  %icmp = icmp ult i64 %iv, 337
+  %trunc = trunc i64 %iv to i32
+  br i1 %icmp, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 9de675b285309..fe48008792ff7 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -147,14 +147,57 @@ exit:
 }
 
 ; This test has two FORs (for.x and for.y) where incoming value from the previous
-; iteration (for.x.prev) of one FOR (for.y) depends on another FOR (for.x). Due to
-; this dependency all uses of the former FOR (for.y) should be sunk after
-; incoming value from the previous iteration (for.x.prev) of te latter FOR (for.y).
-; That means side-effecting user (store i64 %for.y.i64, ptr %gep) of the latter
-; FOR (for.y) should be moved which is not currently supported.
+; iteration (for.x.prev) of one FOR (for.y) depends on another FOR (for.x).
+; Sinking would require moving a recipe with side effects (store). Instead,
+; for.x.next can be hoisted.
 define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-LABEL: 'test_chained_first_order_recurrences_4'
-; CHECK: No VPlans built.
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<4098> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   WIDEN ir<%for.x.next> = mul ir<%x>, ir<2>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT:     FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next>
+; CHECK-NEXT:     FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev>
+; CHECK-NEXT:     vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
+; CHECK-NEXT:     EMIT vp<[[SPLICE_X:%.+]]> = first-order splice ir<%for.x>, ir<%for.x.next>
+; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
+; CHECK-NEXT:     EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
+; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
+; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_C]]>
+; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<ret>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]>
+; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>.1
+; CHECK-NEXT: }
 ;
 entry:
   br label %loop
@@ -178,7 +221,54 @@ ret:
 
 define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-LABEL: 'test_chained_first_order_recurrences_5_hoist_to_load'
-; CHECK: No VPlans built.
built. +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<4098> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next> +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev> +; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%l>, ir<2> +; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next> +; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32 +; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev> +; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1> +; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1> +; CHECK-NEXT: EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_C]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0> +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]> +; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>.1 +; CHECK-NEXT: } ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll index dbe373b46cce2..20acc10535323 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll @@ -123,9 +123,9 @@ define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi 
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -246,9 +246,9 @@ define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -382,22 +382,57 @@ exit:
   ret void
 }
 
-define void @hoist_previous_value_and_operands(ptr %dst, i64 %mask) {
-; CHECK-LABEL: @hoist_previous_value_and_operands(
+; Similar to the test cases for https://github.com/llvm/llvm-project/issues/106523.
+; The previous truncation (%trunc) gets vectorized (rather than folded into an
+; IV) and hoisted along with its AND operand above the user 'or'.
+define void @hoist_previous_value_and_operand(ptr %dst, i64 %mask) {
+; CHECK-LABEL: @hoist_previous_value_and_operand(
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MASK:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP6]] = or <4 x i32> [[TMP5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 336
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[ADD:%.*]], [[LOOP]] ], [ 1, [[BB:%.*]] ]
-; CHECK-NEXT:    [[FOR_1:%.*]] = phi i32 [ [[TRUNC:%.*]], [[LOOP]] ], [ 1, [[BB]] ]
-; CHECK-NEXT:    [[FOR_2:%.*]] = phi i32 [ [[OR:%.*]], [[LOOP]] ], [ 0, [[BB]] ]
-; CHECK-NEXT:    [[OR]] = or i32 [[FOR_1]], 0
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OR]] = or i32 [[FOR_1]], 3
 ; CHECK-NEXT:    [[ADD]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    store i32 [[FOR_2]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
-; CHECK-NEXT:    [[A:%.*]] = and i64 [[IV]], [[MASK:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = and i64 [[IV]], [[MASK]]
 ; CHECK-NEXT:    [[TRUNC]] = trunc i64 [[A]] to i32
-; CHECK-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -405,10 +440,10 @@ bb:
   br label %loop
 
 loop:
-  %iv = phi i64 [ %add, %loop ], [ 1, %bb ]
-  %for.1 = phi i32 [ %trunc, %loop ], [ 1, %bb ]
-  %for.2 = phi i32 [ %or, %loop ], [ 0, %bb ]
-  %or = or i32 %for.1, 0
+  %iv = phi i64 [ 1, %bb ], [ %add, %loop ]
+  %for.1 = phi i32 [ 1, %bb ], [ %trunc, %loop ]
+  %for.2 = phi i32 [ 0, %bb ], [ %or, %loop ]
+  %or = or i32 %for.1, 3
   %add = add i64 %iv, 1
   %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
   store i32 %for.2, ptr %gep, align 4