Skip to content

Commit 6e8dc4a

Browse files
committed
[LV] Change loops' interleave count computation (llvm#73766)
[LV] Change loops' interleave count computation. A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#56), when tested on an AArch64 platform, demonstrates that loop interleaving is beneficial when the vector loop runs at least twice or when the epilogue loop trip count (TC) is minimal. Therefore, we choose the interleave count (IC) between TC/VF and TC/(2*VF) (VF = vectorization factor), such that the remainder TC for the epilogue loop is minimized, preferring the larger IC when the remainder TC is the same for both. The initial tests for this change were submitted in PRs llvm#70272 and llvm#74689.
1 parent 95997b3 commit 6e8dc4a

File tree

8 files changed

+469
-544
lines changed

8 files changed

+469
-544
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5903,21 +5903,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
59035903
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
59045904
}
59055905

5906-
// If trip count is known or estimated compile time constant, limit the
5907-
// interleave count to be less than the trip count divided by VF, provided it
5908-
// is at least 1.
5909-
//
5910-
// For scalable vectors we can't know if interleaving is beneficial. It may
5911-
// not be beneficial for small loops if none of the lanes in the second vector
5912-
// iterations is enabled. However, for larger loops, there is likely to be a
5913-
// similar benefit as for fixed-width vectors. For now, we choose to leave
5914-
// the InterleaveCount as if vscale is '1', although if some information about
5915-
// the vector is known (e.g. min vector size), we can make a better decision.
5916-
if (BestKnownTC) {
5917-
MaxInterleaveCount =
5918-
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5919-
// Make sure MaxInterleaveCount is greater than 0.
5920-
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5906+
unsigned EstimatedVF = VF.getKnownMinValue();
5907+
if (VF.isScalable()) {
5908+
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5909+
EstimatedVF *= *VScale;
5910+
}
5911+
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5912+
5913+
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5914+
if (KnownTC) {
5915+
// If trip count is known we select between two prospective ICs, where
5916+
// 1) the aggressive IC is capped by the trip count divided by VF
5917+
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5918+
// The final IC is selected in a way that the epilogue loop trip count is
5919+
// minimized while maximizing the IC itself, so that we either run the
5920+
// vector loop at least once if it generates a small epilogue loop, or else
5921+
// we run the vector loop at least twice.
5922+
5923+
unsigned InterleaveCountUB = bit_floor(
5924+
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5925+
unsigned InterleaveCountLB = bit_floor(std::max(
5926+
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5927+
MaxInterleaveCount = InterleaveCountLB;
5928+
5929+
if (InterleaveCountUB != InterleaveCountLB) {
5930+
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5931+
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5932+
// If both produce same scalar tail, maximize the IC to do the same work
5933+
// in fewer vector loop iterations
5934+
if (TailTripCountUB == TailTripCountLB)
5935+
MaxInterleaveCount = InterleaveCountUB;
5936+
}
5937+
} else if (BestKnownTC) {
5938+
// If trip count is an estimated compile time constant, limit the
5939+
// IC to be capped by the trip count divided by VF * 2, such that the vector
5940+
// loop runs at least twice to make interleaving seem profitable when there
5941+
// is an epilogue loop present. Since exact Trip count is not known we
5942+
// choose to be conservative in our IC estimate.
5943+
MaxInterleaveCount = bit_floor(std::max(
5944+
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
59215945
}
59225946

59235947
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ target triple = "aarch64-linux-gnu"
55

66
%pair = type { i8, i8 }
77

8-
; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
8+
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
99
; it should conservatively choose IC 1 so that the vector loop runs twice at least
10-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
10+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
1111
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
1212
entry:
1313
br label %for.body
@@ -29,9 +29,9 @@ for.end:
2929
ret void
3030
}
3131

32-
; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
32+
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
3333
; it should conservatively choose IC 1 so that the vector loop runs twice at least
34-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
34+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3535
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
3636
entry:
3737
br label %for.body
@@ -53,9 +53,9 @@ for.end:
5353
ret void
5454
}
5555

56-
; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
56+
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
5757
; it should conservatively choose IC 1 so that the vector loop runs twice at least
58-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
58+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
5959
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
6060
entry:
6161
br label %for.body
@@ -77,9 +77,9 @@ for.end:
7777
ret void
7878
}
7979

80-
; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
80+
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
8181
; it should conservatively choose IC 1 so that the vector loop runs twice at least
82-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
8484
entry:
8585
br label %for.body
@@ -101,9 +101,9 @@ for.end:
101101
ret void
102102
}
103103

104-
; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
104+
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105105
; it should choose conservatively IC 2 so that the vector loop runs twice at least
106-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107107
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
108108
entry:
109109
br label %for.body
@@ -125,9 +125,9 @@ for.end:
125125
ret void
126126
}
127127

128-
; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
128+
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
129129
; it should choose conservatively IC 2 so that the vector loop runs twice at least
130-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
131131
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
132132
entry:
133133
br label %for.body
@@ -149,9 +149,9 @@ for.end:
149149
ret void
150150
}
151151

152-
; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
152+
; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
153153
; it should choose conservatively IC 4 so that the vector loop runs twice at least
154-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
155155
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
156156
entry:
157157
br label %for.body
@@ -173,9 +173,9 @@ for.end:
173173
ret void
174174
}
175175

176-
; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
176+
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
177177
; it should choose conservatively IC 4 so that the vector loop runs twice at least
178-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
178+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
179179
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
180180
entry:
181181
br label %for.body
@@ -197,9 +197,9 @@ for.end:
197197
ret void
198198
}
199199

200-
; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
200+
; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
201201
; it should choose conservatively IC 4 so that the vector loop runs twice at least
202-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
202+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
203203
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
204204
entry:
205205
br label %for.body
@@ -221,9 +221,9 @@ for.end:
221221
ret void
222222
}
223223

224-
; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
224+
; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
225225
; it should choose conservatively IC 4 so that the vector loop runs twice at least
226-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
226+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
227227
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
228228
entry:
229229
br label %for.body
@@ -245,7 +245,7 @@ for.end:
245245
ret void
246246
}
247247

248-
; TODO: For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
248+
; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
249249
; the IC will be capped by the target-specific maximum interleave count
250250
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
251251
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ for.end:
7777
ret void
7878
}
7979

80-
; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
80+
; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
8181
; IC 1 since there will be no remainder loop that needs to run after the vector loop.
82-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383
define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
8484
entry:
8585
br label %for.body
@@ -101,9 +101,9 @@ for.end:
101101
ret void
102102
}
103103

104-
; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
104+
; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
105105
; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
106-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
107107
define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
108108
entry:
109109
br label %for.body
@@ -125,9 +125,9 @@ for.end:
125125
ret void
126126
}
127127

128-
; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
128+
; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
129129
; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
130-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
131131
define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
132132
entry:
133133
br label %for.body
@@ -149,9 +149,9 @@ for.end:
149149
ret void
150150
}
151151

152-
; TODO: For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
152+
; For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
153153
; IC 2 since a remainder loop TC of 4 is more efficient than remainder loop TC of 36 with IC 4
154-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
155155
define void @loop_with_tc_100(ptr noalias %p, ptr noalias %q) {
156156
entry:
157157
br label %for.body
@@ -245,9 +245,9 @@ for.end:
245245
ret void
246246
}
247247

248-
; TODO: For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
248+
; For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
249249
; IC 4 since a remainder loop TC of 1 is more efficient than remainder loop TC of 65 with IC 8
250-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
250+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
251251
define void @loop_with_tc_193(ptr noalias %p, ptr noalias %q) {
252252
entry:
253253
br label %for.body

llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@
88
; CHECK-NEXT: fadd
99
; CHECK-NEXT: fadd
1010
; CHECK-NEXT: fadd
11-
; CHECK-NEXT: fadd
12-
; CHECK-NEXT: fadd
13-
; CHECK-NEXT: fadd
14-
; CHECK-NEXT: fadd
1511
; CHECK-NEXT: =
1612
; CHECK-NOT: fadd
1713
; CHECK-SAME: >

0 commit comments

Comments
 (0)