[LV] Use shl for (VFxUF * vscale) when creating minimum iter check. #153495

Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Florian Hahn (fhahn)

Changes

Directly emit shl instead of a multiply if VFxUF is a power of 2. The main motivation is to prepare the code and tests for directly generating and expanding a SCEV expression for the minimum iteration count. SCEVExpander will directly emit shl for multiplies by powers of 2, and InstCombine also performs this combine, so end-to-end this should effectively be NFC.

Patch is 206.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153495.diff

74 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2cee36003a39e..a54fb1ac4431a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2305,16 +2305,27 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
auto CreateStep = [&]() -> Value * {
+ ElementCount VFTimesUF = VF.multiplyCoefficientBy(UF);
+ Value *VFxUF = nullptr;
+ if (!VFTimesUF.isScalable() ||
+ !isPowerOf2_64(VFTimesUF.getKnownMinValue())) {
+ VFxUF = createStepForVF(Builder, CountTy, VF, UF);
+ } else {
+ VFxUF = Builder.CreateShl(
+ Builder.CreateVScale(CountTy),
+ ConstantInt::get(CountTy, Log2_64(VFTimesUF.getKnownMinValue())), "",
+ true);
+ }
+
// Create step with max(MinProTripCount, UF * VF).
- if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
- return createStepForVF(Builder, CountTy, VF, UF);
+ if (ElementCount::isKnownGE(VFTimesUF, MinProfitableTripCount))
+ return VFxUF;
Value *MinProfTC =
createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
if (!VF.isScalable())
return MinProfTC;
- return Builder.CreateBinaryIntrinsic(
- Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::umax, MinProfTC, VFxUF);
};
TailFoldingStyle Style = Cost->getTailFoldingStyle();
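For reference, the hunk above relies on the identity x * C == x << log2(C) for any power-of-two constant C, which is why replacing the mul with an shl (carrying over the nuw flag) preserves behavior. Below is a minimal standalone C++ sketch of that equivalence; isPowerOf2U64 and log2U64 are local stand-ins for llvm::isPowerOf2_64 and llvm::Log2_64 from llvm/Support/MathExtras.h, so the snippet compiles without LLVM headers.

#include <cassert>
#include <cstdint>

// Local stand-ins for llvm::isPowerOf2_64 and llvm::Log2_64.
static bool isPowerOf2U64(uint64_t V) { return V && (V & (V - 1)) == 0; }
static unsigned log2U64(uint64_t V) {
  unsigned Shift = 0;
  while (V >>= 1)
    ++Shift;
  return Shift;
}

int main() {
  // The VF * UF factors that appear in the updated tests: 2, 4, 8, 16.
  const uint64_t Factors[] = {2, 4, 8, 16};
  for (uint64_t VScale = 1; VScale <= 64; ++VScale) {
    for (uint64_t C : Factors) {
      assert(isPowerOf2U64(C));
      // mul nuw i64 %vscale, C is equivalent to shl nuw i64 %vscale, log2(C).
      assert(VScale * C == (VScale << log2U64(C)));
    }
  }
  return 0;
}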
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index bf72fea73d40b..4ecda056e6910 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -7,8 +7,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], i64 [[M:%.*]], i64 [[CONV6:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[CONV61:%.*]] = zext i32 [[X]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP10]], 2
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index 6a592edfc1d64..7363f86cdab7a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -10,7 +10,7 @@ define void @f1(ptr %A) #0 {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index a8d0b37cac3c9..4ddf51b9bdd58 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -72,7 +72,7 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
; SVE: for.body.preheader:
; SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SVE: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 3cef1f6e03ff9..b821593f7845f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -13,7 +13,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
+; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 1c4b62183d939..20eb884261572 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -11,7 +11,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ITER_CHECK:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
@@ -147,7 +147,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ITER_CHECK:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index fa8d17c5c28fc..e018b2016c915 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -137,7 +137,7 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
; INTERLEAVE-4-VLA-NEXT: entry:
; INTERLEAVE-4-VLA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-4-VLA-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; INTERLEAVE-4-VLA-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; INTERLEAVE-4-VLA: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index bbc2e324941a1..635efad8c499a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -49,7 +49,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VS1-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
; CHECK-VS1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-VS1-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
; CHECK-VS1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK-VS1: [[VECTOR_SCEVCHECK]]:
@@ -64,7 +64,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK-VS1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-VS1-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 16
+; CHECK-VS1-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 4
; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]]
; CHECK-VS1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VS1: [[VECTOR_PH]]:
@@ -149,7 +149,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VS2-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
; CHECK-VS2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-VS2-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
; CHECK-VS2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK-VS2: [[VECTOR_SCEVCHECK]]:
@@ -164,7 +164,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK-VS2: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-VS2-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 8
+; CHECK-VS2-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 3
; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]]
; CHECK-VS2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VS2: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 7028678b338f0..8e6ac48a5cbc8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -904,7 +904,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-NEXT: [[ENTRY:.*]]:
; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
+; TFNONE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; TFNONE: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
index f284afc38788a..87a18ba2c18ea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
@@ -10,7 +10,7 @@ define void @foo() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -32,22 +32,22 @@ define void @foo() {
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison)
; CHECK-NEXT: br label [[INNER_LOOP1:%.*]]
; CHECK: inner_loop1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP10:%.*]], [[INNER_LOOP1]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP9:%.*]], [[INNER_LOOP1]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <vscale x 4 x i64> [[VEC_PHI]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP9]] = fmul <vscale x 4 x float> [[VEC_PHI2]], [[WIDE_MASKED_GATHER3]]
-; CHECK-NEXT: [[TMP10]] = add nuw nsw <vscale x 4 x i64> [[VEC_PHI]], splat (i64 1)
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <vscale x 4 x i64> [[TMP10]], splat (i64 512)
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i1> [[TMP11]], i32 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[VECTOR_LATCH]], label [[INNER_LOOP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP12:%.*]], [[INNER_LOOP1]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi <vscale x 4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[INNER_LOOP1]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP11]] = fmul <vscale x 4 x float> [[TMP9]], [[WIDE_MASKED_GATHER2]]
+; CHECK-NEXT: [[TMP12]] = add nuw nsw <vscale x 4 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i64> [[TMP12]], splat (i64 512)
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[TMP13]], i32 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_LATCH]], label [[INNER_LOOP1]]
; CHECK: vector.latch:
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x float> [ [[TMP9]], [[INNER_LOOP1]] ]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VEC_PHI4]], <vscale x 4 x ptr> [[TMP7]], i32 4, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP15:%.*]] = phi <vscale x 4 x float> [ [[TMP11]], [[INNER_LOOP1]] ]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[TMP15]], <vscale x 4 x ptr> [[TMP7]], i32 4, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 7232fe5f019f2..330c22d736809 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -53,7 +53,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
@@ -94,7 +94,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
@@ -205,7 +205,7 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
@@ -246,7 +246,7 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
@@ -357,7 +357,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
@@ -398,7 +398,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-SVE-MA...
[truncated]
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
@@ -398,7 +398,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-SVE-MA...
[truncated]
Could we also get the same thing if we instantiate the IRBuilder above with IRBuilder<InstSimplifyFolder>? I presume that if it's limited to just this part of createIterationCountCheck, the IR should be well formed.

If I understand correctly, we should probably be moving the IRBuilder to use the TargetFolder, at least for constant folding. See the comment by @nikic on #150931. This is something I've considered doing in the loop vectoriser and VPlan.
InstSimplifyFolder can't convert mul -> shl.

Yep, InstSimplifyFolder can only fold to an existing instruction, so it cannot create a new SHL instruction.
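For illustration, a minimal sketch of the distinction — not code from the patch; it assumes an i64 CountTy and an IRBuilder<InstSimplifyFolder> named Builder already in scope:

    // With IRBuilder<InstSimplifyFolder>, CreateMul can only return an already
    // existing value when InstSimplify proves the product redundant; the folder
    // never creates a different instruction, so this still emits a plain mul:
    Value *VScale = Builder.CreateVScale(CountTy);
    Value *Mul = Builder.CreateMul(VScale, ConstantInt::get(CountTy, 4));
    // The strength reduction to a shift has to be requested explicitly, which
    // is what this patch does when VF * UF is a power-of-2:
    Value *Shl = Builder.CreateShl(VScale, ConstantInt::get(CountTy, 2), "",
                                   /*HasNUW=*/true);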
Force-pushed from 55f96a3 to 9fd0cf2.
IRBuilder<InstSimplifyFolder> Builder(
    TCCheckBlock->getContext(),
    InstSimplifyFolder(TCCheckBlock->getDataLayout()));
Builder.SetInsertPoint(TCCheckBlock->getTerminator());
note: this helps to remove some redundant shl x, 0 instructions.
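To make the note concrete — a hedged example, again assuming the CountTy and Builder from the patch context — take UF == 1 with VF == vscale x 1, where Log2_64(1) == 0:

    // shl %x, 0 simplifies to %x, so the InstSimplifyFolder returns the vscale
    // value directly and no redundant shl instruction is inserted.
    Value *VScale = Builder.CreateVScale(CountTy);
    Value *VFxUF = Builder.CreateShl(VScale, ConstantInt::get(CountTy, 0), "",
                                     /*HasNUW=*/true);
    // Here VFxUF == VScale.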
LGTM
if (!VF.isScalable() || !isPowerOf2_64(VF.getKnownMinValue()) ||
    !isPowerOf2_64(Step))
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));

return B.CreateShl(
    B.CreateVScale(Ty),
    ConstantInt::get(Ty, Log2_64((VF * Step).getKnownMinValue())), "", true);
Minor nit: would it be slightly easier to read to early-return in the optimised case, i.e.
-  if (!VF.isScalable() || !isPowerOf2_64(VF.getKnownMinValue()) ||
-      !isPowerOf2_64(Step))
-    return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
-  return B.CreateShl(
-      B.CreateVScale(Ty),
-      ConstantInt::get(Ty, Log2_64((VF * Step).getKnownMinValue())), "", true);
+  if (VF.isScalable() && isPowerOf2_64(VF.getKnownMinValue()) &&
+      isPowerOf2_64(Step))
+    return B.CreateShl(
+        B.CreateVScale(Ty),
+        ConstantInt::get(Ty, Log2_64((VF * Step).getKnownMinValue())), "", true);
+  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
Not opinionated about this, up to you
Updated, thanks
-  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
+  if (!VF.isScalable() || !isPowerOf2_64(VF.getKnownMinValue()) ||
+      !isPowerOf2_64(Step))
+    return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
nit: Given you're effectively doing the same thing below, you could just do:
ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
if (!VF.isScalable() || !isPowerOf2_64(VF.getKnownMinValue()) ||
    !isPowerOf2_64(Step))
  return B.CreateElementCount(Ty, VFxStep);
return B.CreateShl(
    B.CreateVScale(Ty),
    ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
Updated, thanks
@@ -824,7 +824,13 @@ namespace llvm {
 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                        int64_t Step) {
   assert(Ty->isIntegerTy() && "Expected an integer step");
-  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
+  if (!VF.isScalable() || !isPowerOf2_64(VF.getKnownMinValue()) ||
nit: Do we need the check for isPowerOf2_64(VF.getKnownMinValue())? I thought the vectoriser rejected non-power-of-2 VFs? I just wonder if this can be an assert instead.
There was one user that didn't pass an actual VF, but that was easy to update to use IRBuilder::CreateElementCount. I added the assert.
Force-pushed from 9fd0cf2 to 2bb76fa.
if (VF.isScalable() && isPowerOf2_64(Step)) {
  return B.CreateShl(
      B.CreateVScale(Ty),
      ConstantInt::get(Ty, Log2_64((VF * Step).getKnownMinValue())), "",
Looks like it's not reusing VFxStep - I think it got lost from the previous commit.
Ah yes, added back, thanks!
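Putting the review rounds together, the helper converges on roughly the following shape — a sketch reflecting the threads above (the assert in place of the VF power-of-2 check, the reused VFxStep, and the shl path returned first), not necessarily the exact committed code:

    Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                           int64_t Step) {
      assert(Ty->isIntegerTy() && "Expected an integer step");
      assert(isPowerOf2_64(VF.getKnownMinValue()) &&
             "VF's minimum known value is expected to be a power-of-2");
      ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
      // Emit shl vscale, log2(VF * Step) for scalable, power-of-2 steps ...
      if (VF.isScalable() && isPowerOf2_64(Step))
        return B.CreateShl(
            B.CreateVScale(Ty),
            ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "",
            /*HasNUW=*/true);
      // ... and fall back to the generic element-count expansion otherwise.
      return B.CreateElementCount(Ty, VFxStep);
    }

Keeping the generic CreateElementCount path as the fall-through means fixed VFs and non-power-of-2 steps are unaffected by the change.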
Directly emit shl instead of a multiply if VFxUF is a power-of-2. The main motivation here is to prepare the code and tests for directly generating and expanding a SCEV expression of the minimum iteration count. SCEVExpander will directly emit shl for multiplies with powers-of-2. InstCombine also performs this combine, so end-to-end this should effectively be NFC.
Force-pushed from 669216b to d34f0bf.
…. (#153495) Directly emit shl instead of a multiply if VF * Step is a power-of-2. The main motivation here is to prepare the code and tests for directly generating and expanding a SCEV expression of the minimum iteration count. SCEVExpander will directly emit shl for multiplies with powers-of-2. InstCombine also performs this combine, so end-to-end this should effectively be NFC. PR: llvm/llvm-project#153495
Directly emit shl instead of a multiply if VFxUF is a power-of-2. The main motivation here is to prepare the code and tests for directly generating and expanding a SCEV expression of the minimum iteration count. SCEVExpander will directly emit shl for multiplies with powers-of-2. InstCombine also performs this combine, so end-to-end this should effectively be NFC.