Skip to content

Commit 41a3828

Browse files
authored
[LV] Added pre-commit tests for changing loop interleaving count computation (#74689)
Added more pre-commit tests for evaluating changes to the loop interleaving count computation in #73766. The new set of tests addresses the change in IC computation that minimizes the remainder TC of the vectorized loop while maximizing the IC when the remainder TC is the same.
1 parent dc55703 commit 41a3828

File tree

3 files changed

+576
-107
lines changed

3 files changed

+576
-107
lines changed

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

Lines changed: 0 additions & 107 deletions
This file was deleted.
Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
3+
4+
target triple = "aarch64-linux-gnu"
5+
6+
%pair = type { i8, i8 } ; two i8 fields; each test loop loads both fields of one element per iteration
7+
8+
; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
9+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
10+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
11+
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
12+
entry:
13+
br label %for.body
14+
15+
for.body:
16+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
17+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
18+
%tmp1 = load i8, ptr %tmp0, align 1
19+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
20+
%tmp3 = load i8, ptr %tmp2, align 1
21+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
22+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
23+
store i8 %add, ptr %qi, align 1
24+
%i.next = add nuw nsw i64 %i, 1
25+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
26+
br i1 %cond, label %for.end, label %for.body, !prof !0 ; !0 weights in successor order: 1 for %for.end (exit), 31 for %for.body (backedge)
27+
28+
for.end:
29+
ret void
30+
}
31+
32+
; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
33+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
34+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
35+
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
36+
entry:
37+
br label %for.body
38+
39+
for.body:
40+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
41+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
42+
%tmp1 = load i8, ptr %tmp0, align 1
43+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
44+
%tmp3 = load i8, ptr %tmp2, align 1
45+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
46+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
47+
store i8 %add, ptr %qi, align 1
48+
%i.next = add nuw nsw i64 %i, 1
49+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
50+
br i1 %cond, label %for.end, label %for.body, !prof !1 ; !1 weights in successor order: 1 for %for.end (exit), 32 for %for.body (backedge)
51+
52+
for.end:
53+
ret void
54+
}
55+
56+
; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
57+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
58+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
59+
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
60+
entry:
61+
br label %for.body
62+
63+
for.body:
64+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
65+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
66+
%tmp1 = load i8, ptr %tmp0, align 1
67+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
68+
%tmp3 = load i8, ptr %tmp2, align 1
69+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
70+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
71+
store i8 %add, ptr %qi, align 1
72+
%i.next = add nuw nsw i64 %i, 1
73+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
74+
br i1 %cond, label %for.end, label %for.body, !prof !2 ; !2 weights in successor order: 1 for %for.end (exit), 47 for %for.body (backedge)
75+
76+
for.end:
77+
ret void
78+
}
79+
80+
; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
81+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
83+
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
84+
entry:
85+
br label %for.body
86+
87+
for.body:
88+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
89+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
90+
%tmp1 = load i8, ptr %tmp0, align 1
91+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
92+
%tmp3 = load i8, ptr %tmp2, align 1
93+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
94+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
95+
store i8 %add, ptr %qi, align 1
96+
%i.next = add nuw nsw i64 %i, 1
97+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
98+
br i1 %cond, label %for.end, label %for.body, !prof !3 ; !3 weights in successor order: 1 for %for.end (exit), 62 for %for.body (backedge)
99+
100+
for.end:
101+
ret void
102+
}
103+
104+
; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105+
; it should conservatively choose IC 2 so that the vector loop runs at least twice
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
107+
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
108+
entry:
109+
br label %for.body
110+
111+
for.body:
112+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
113+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
114+
%tmp1 = load i8, ptr %tmp0, align 1
115+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
116+
%tmp3 = load i8, ptr %tmp2, align 1
117+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
118+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
119+
store i8 %add, ptr %qi, align 1
120+
%i.next = add nuw nsw i64 %i, 1
121+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
122+
br i1 %cond, label %for.end, label %for.body, !prof !4 ; !4 weights in successor order: 1 for %for.end (exit), 63 for %for.body (backedge)
123+
124+
for.end:
125+
ret void
126+
}
127+
128+
; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
129+
; it should conservatively choose IC 2 so that the vector loop runs at least twice
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
131+
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
132+
entry:
133+
br label %for.body
134+
135+
for.body:
136+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
137+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
138+
%tmp1 = load i8, ptr %tmp0, align 1
139+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
140+
%tmp3 = load i8, ptr %tmp2, align 1
141+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
142+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
143+
store i8 %add, ptr %qi, align 1
144+
%i.next = add nuw nsw i64 %i, 1
145+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
146+
br i1 %cond, label %for.end, label %for.body, !prof !5 ; !5 weights in successor order: 1 for %for.end (exit), 99 for %for.body (backedge)
147+
148+
for.end:
149+
ret void
150+
}
151+
152+
; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
153+
; it should conservatively choose IC 4 so that the vector loop runs at least twice
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
155+
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
156+
entry:
157+
br label %for.body
158+
159+
for.body:
160+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
161+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
162+
%tmp1 = load i8, ptr %tmp0, align 1
163+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
164+
%tmp3 = load i8, ptr %tmp2, align 1
165+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
166+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
167+
store i8 %add, ptr %qi, align 1
168+
%i.next = add nuw nsw i64 %i, 1
169+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
170+
br i1 %cond, label %for.end, label %for.body, !prof !6 ; !6 weights in successor order: 1 for %for.end (exit), 127 for %for.body (backedge)
171+
172+
for.end:
173+
ret void
174+
}
175+
176+
; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
177+
; it should conservatively choose IC 4 so that the vector loop runs at least twice
178+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
179+
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
180+
entry:
181+
br label %for.body
182+
183+
for.body:
184+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
185+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
186+
%tmp1 = load i8, ptr %tmp0, align 1
187+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
188+
%tmp3 = load i8, ptr %tmp2, align 1
189+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
190+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
191+
store i8 %add, ptr %qi, align 1
192+
%i.next = add nuw nsw i64 %i, 1
193+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
194+
br i1 %cond, label %for.end, label %for.body, !prof !7 ; !7 weights in successor order: 1 for %for.end (exit), 128 for %for.body (backedge)
195+
196+
for.end:
197+
ret void
198+
}
199+
200+
; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
201+
; it should conservatively choose IC 4 so that the vector loop runs at least twice
202+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
203+
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
204+
entry:
205+
br label %for.body
206+
207+
for.body:
208+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
209+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
210+
%tmp1 = load i8, ptr %tmp0, align 1
211+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
212+
%tmp3 = load i8, ptr %tmp2, align 1
213+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
214+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
215+
store i8 %add, ptr %qi, align 1
216+
%i.next = add nuw nsw i64 %i, 1
217+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
218+
br i1 %cond, label %for.end, label %for.body, !prof !8 ; !8 weights in successor order: 1 for %for.end (exit), 179 for %for.body (backedge)
219+
220+
for.end:
221+
ret void
222+
}
223+
224+
; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
225+
; it should conservatively choose IC 4 so that the vector loop runs at least twice
226+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
227+
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
228+
entry:
229+
br label %for.body
230+
231+
for.body:
232+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
233+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
234+
%tmp1 = load i8, ptr %tmp0, align 1
235+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
236+
%tmp3 = load i8, ptr %tmp2, align 1
237+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
238+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
239+
store i8 %add, ptr %qi, align 1
240+
%i.next = add nuw nsw i64 %i, 1
241+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
242+
br i1 %cond, label %for.end, label %for.body, !prof !9 ; !9 weights in successor order: 1 for %for.end (exit), 192 for %for.body (backedge)
243+
244+
for.end:
245+
ret void
246+
}
247+
248+
; TODO: For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
249+
; the IC will be capped by the target-specific maximum interleave count
250+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
251+
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) { ; do-while loop: q[i] = p[i].0 + p[i].1, from i = 0 until i + 1 == n
252+
entry:
253+
br label %for.body
254+
255+
for.body:
256+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; index: 0 from %entry, %i.next from the backedge
257+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the %i-th %pair at %p
258+
%tmp1 = load i8, ptr %tmp0, align 1
259+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
260+
%tmp3 = load i8, ptr %tmp2, align 1
261+
%add = add i8 %tmp1, %tmp3 ; sum of the two loaded fields
262+
%qi = getelementptr i8, ptr %q, i64 %i ; address of byte %i at %q
263+
store i8 %add, ptr %qi, align 1
264+
%i.next = add nuw nsw i64 %i, 1
265+
%cond = icmp eq i64 %i.next, %n ; true once the incremented index equals %n
266+
br i1 %cond, label %for.end, label %for.body, !prof !10 ; !10 weights in successor order: 1 for %for.end (exit), 999 for %for.body (backedge)
267+
268+
for.end:
269+
ret void
270+
}
271+
272+
!0 = !{!"branch_weights", i32 1, i32 31} ; exit : backedge = 1 : 31, used by @loop_with_profile_tc_32
273+
!1 = !{!"branch_weights", i32 1, i32 32} ; exit : backedge = 1 : 32, used by @loop_with_profile_tc_33
274+
!2 = !{!"branch_weights", i32 1, i32 47} ; exit : backedge = 1 : 47, used by @loop_with_profile_tc_48
275+
!3 = !{!"branch_weights", i32 1, i32 62} ; exit : backedge = 1 : 62, used by @loop_with_profile_tc_63
276+
!4 = !{!"branch_weights", i32 1, i32 63} ; exit : backedge = 1 : 63, used by @loop_with_profile_tc_64
277+
!5 = !{!"branch_weights", i32 1, i32 99} ; exit : backedge = 1 : 99, used by @loop_with_profile_tc_100
278+
!6 = !{!"branch_weights", i32 1, i32 127} ; exit : backedge = 1 : 127, used by @loop_with_profile_tc_128
279+
!7 = !{!"branch_weights", i32 1, i32 128} ; exit : backedge = 1 : 128, used by @loop_with_profile_tc_129
280+
!8 = !{!"branch_weights", i32 1, i32 179} ; exit : backedge = 1 : 179, used by @loop_with_profile_tc_180
281+
!9 = !{!"branch_weights", i32 1, i32 192} ; exit : backedge = 1 : 192, used by @loop_with_profile_tc_193
282+
!10 = !{!"branch_weights", i32 1, i32 999} ; exit : backedge = 1 : 999, used by @loop_with_profile_tc_1000

0 commit comments

Comments
 (0)