Skip to content

Commit c492eb6

Browse files
authored
[LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (#79651)
Update the loop interleave count computation to handle loops that require at least one scalar iteration to run in the epilogue loop. In that case, the trip count available for interleaving is one less than the loop's trip count.
1 parent 9a1ca24 commit c492eb6

File tree

3 files changed

+31
-18
lines changed

3 files changed

+31
-18
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
54365436
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
54375437

54385438
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5439-
if (KnownTC) {
5439+
if (KnownTC > 0) {
5440+
// When a scalar epilogue is required, at least one iteration must be scalar,
5441+
// so the maximum number of iterations available for interleaving is one less.
5442+
unsigned AvailableTC =
5443+
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5444+
54405445
// If trip count is known we select between two prospective ICs, where
54415446
// 1) the aggressive IC is capped by the trip count divided by VF
54425447
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
54465451
// we run the vector loop at least twice.
54475452

54485453
unsigned InterleaveCountUB = bit_floor(
5449-
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5454+
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
54505455
unsigned InterleaveCountLB = bit_floor(std::max(
5451-
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5456+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
54525457
MaxInterleaveCount = InterleaveCountLB;
54535458

54545459
if (InterleaveCountUB != InterleaveCountLB) {
5455-
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5456-
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5460+
unsigned TailTripCountUB =
5461+
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5462+
unsigned TailTripCountLB =
5463+
(AvailableTC % (EstimatedVF * InterleaveCountLB));
54575464
// If both produce same scalar tail, maximize the IC to do the same work
54585465
// in fewer vector loop iterations
54595466
if (TailTripCountUB == TailTripCountLB)
54605467
MaxInterleaveCount = InterleaveCountUB;
54615468
}
5462-
} else if (BestKnownTC) {
5469+
} else if (BestKnownTC && *BestKnownTC > 0) {
5470+
// When a scalar epilogue is required, at least one iteration must be scalar,
5471+
// so the maximum number of iterations available for interleaving is one less.
5472+
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5473+
? (*BestKnownTC) - 1
5474+
: *BestKnownTC;
5475+
54635476
// If trip count is an estimated compile time constant, limit the
54645477
// IC to be capped by the trip count divided by VF * 2, such that the vector
54655478
// loop runs at least twice to make interleaving seem profitable when there
54665479
// is an epilogue loop present. Since the exact trip count is not known, we
54675480
// choose to be conservative in our IC estimate.
54685481
MaxInterleaveCount = bit_floor(std::max(
5469-
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5482+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
54705483
}
54715484

54725485
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ for.end:
129129
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
130130
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
131131
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
132-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
133133
; remainder than IC 2
134-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
134+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
135135
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
136136
entry:
137137
br label %for.body
@@ -211,17 +211,17 @@ for.end:
211211
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
212212
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
213213
; interleaving.
214-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
215215
; remainder than IC 4
216-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
216+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
217217
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
218218
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
219219
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
220220
; CHECK-IR-NEXT: iter.check:
221221
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
222222
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
223223
; CHECK-IR: vector.main.loop.iter.check:
224-
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
224+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
225225
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
226226
;
227227
entry:

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ for.end:
3333
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
3434
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
3535
; correctness, making at most 31 iterations available for interleaving.
36-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
36+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
3737
; than IC 2
38-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
38+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3939
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
4040
entry:
4141
br label %for.body
@@ -229,15 +229,15 @@ for.end:
229229
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
230230
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
231231
; correctness, making at most 127 iterations available for interleaving.
232-
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
232+
; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
234234
; remainder than IC 4
235-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
235+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
236236
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
237237
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
238238
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
239239
; CHECK-IR-NEXT: entry:
240-
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
240+
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
241241
entry:
242242
br label %for.body
243243

0 commit comments

Comments
 (0)