
Commit a07f03f
[AArch64] Add a test for LSR chain generation with vscale increments. NFC
1 parent b8b2a01 commit a07f03f
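
What the test exercises, in brief: the inner loop's addresses are all based on pointers that advance by a multiple of vscale each iteration, which LSR can rewrite into a chained increment; in the generated code below that shows up as the inner loop stepping with `addvl x17, x17, #4` and comparing against a precomputed end pointer rather than re-deriving each address from the induction variable. A minimal sketch of that pattern, as hypothetical IR (the function name and constants are illustrative, not from the commit):

; One pointer advanced by 32*vscale bytes per iteration; LSR can keep this as
; a simple increment chain (addvl on AArch64) instead of materialising
; off = i * 32 * vscale each time round the loop.
define void @vscale_chain_sketch(ptr %p, i64 %n) vscale_range(1, 16) {
entry:
  %vscale = call i64 @llvm.vscale.i64()
  %step = shl i64 %vscale, 5            ; 32 * vscale bytes
  br label %loop

loop:
  %off = phi i64 [ 0, %entry ], [ %off.next, %loop ]
  %addr = getelementptr i8, ptr %p, i64 %off
  store <vscale x 8 x half> zeroinitializer, ptr %addr, align 16
  %off.next = add i64 %off, %step
  %done = icmp uge i64 %off.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

declare i64 @llvm.vscale.i64()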

1 file changed
+243 -0 lines changed
@@ -0,0 +1,243 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple aarch64 -mattr=+sve2 -o - %s | FileCheck %s

define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float noundef nofpclass(nan inf) %kernel_factor, ptr %call5.i.i.i119) vscale_range(1, 16) {
; CHECK-LABEL: test:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w1, #1
; CHECK-NEXT:    b.lt .LBB0_6
; CHECK-NEXT:  // %bb.1: // %for.body.lr.ph
; CHECK-NEXT:    rdvl x8, #-2
; CHECK-NEXT:    mov w9, #608 // =0x260
; CHECK-NEXT:    ands x11, x8, x9
; CHECK-NEXT:    b.eq .LBB0_6
; CHECK-NEXT:  // %bb.2: // %for.body.us.preheader
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    add x11, x2, x11, lsl #1
; CHECK-NEXT:    mov x12, #-16 // =0xfffffffffffffff0
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    addvl x12, x12, #1
; CHECK-NEXT:    mov x13, #4 // =0x4
; CHECK-NEXT:    mov x14, #8 // =0x8
; CHECK-NEXT:  .LBB0_3: // %for.body.us
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB0_4 Depth 2
; CHECK-NEXT:    add x15, x0, x9, lsl #2
; CHECK-NEXT:    sbfiz x16, x8, #1, #32
; CHECK-NEXT:    mov x17, x2
; CHECK-NEXT:    ldp s0, s1, [x15]
; CHECK-NEXT:    add x16, x16, #8
; CHECK-NEXT:    ldp s2, s3, [x15, #8]
; CHECK-NEXT:    ubfiz x15, x8, #1, #32
; CHECK-NEXT:    fcvt h0, s0
; CHECK-NEXT:    fcvt h1, s1
; CHECK-NEXT:    fcvt h2, s2
; CHECK-NEXT:    fcvt h3, s3
; CHECK-NEXT:    mov z0.h, h0
; CHECK-NEXT:    mov z1.h, h1
; CHECK-NEXT:    mov z2.h, h2
; CHECK-NEXT:    mov z3.h, h3
; CHECK-NEXT:  .LBB0_4: // %for.cond.i.preheader.us
; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=2
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x17, x15]
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17]
; CHECK-NEXT:    add x18, x17, x16
; CHECK-NEXT:    add x3, x17, x15
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x17, x16]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x13, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
; CHECK-NEXT:    add x18, x18, #16
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT:    st1h { z4.h }, p0, [x17]
; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x3, #1, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x18, x12]
; CHECK-NEXT:    add x18, x18, x12
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x13, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
; CHECK-NEXT:    add x18, x18, #16
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #2, mul vl]
; CHECK-NEXT:    st1h { z4.h }, p0, [x17, #1, mul vl]
; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x3, #2, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x18, x12]
; CHECK-NEXT:    add x18, x18, x12
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x13, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
; CHECK-NEXT:    add x18, x18, #16
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #3, mul vl]
; CHECK-NEXT:    st1h { z4.h }, p0, [x17, #2, mul vl]
; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x3, #3, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x18, x12]
; CHECK-NEXT:    add x18, x18, x12
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x13, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    st1h { z4.h }, p0, [x17, #3, mul vl]
; CHECK-NEXT:    addvl x17, x17, #4
; CHECK-NEXT:    cmp x17, x11
; CHECK-NEXT:    b.lo .LBB0_4
; CHECK-NEXT:  // %bb.5: // %while.cond.i..exit_crit_edge.us
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    add w10, w10, #1
; CHECK-NEXT:    add x9, x9, #4
; CHECK-NEXT:    add w8, w8, #16
; CHECK-NEXT:    cmp w10, w1
; CHECK-NEXT:    b.ne .LBB0_3
; CHECK-NEXT:  .LBB0_6: // %exit78
; CHECK-NEXT:    ret
entry:
  ;%call5.i.i.i119 = tail call noalias noundef nonnull dereferenceable(1248) ptr @_Znwm(i64 noundef 1248) #7
  %cmp139 = icmp sgt i32 %kw, 0
  ;tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(1248) %call5.i.i.i119, i8 0, i64 1248, i1 false)
  br i1 %cmp139, label %for.body.lr.ph, label %exit78

for.body.lr.ph:                                   ; preds = %entry
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
  %vscale = tail call i64 @llvm.vscale.i64()
  %mul5.i = shl nuw nsw i64 %vscale, 5
  %sub.not.i = sub nsw i64 0, %mul5.i
  %sub6.i = and i64 %sub.not.i, 608
  %add.ptr.i = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %sub6.i
  %cmp.i133.not = icmp eq i64 %sub6.i, 0
  %vs2 = shl nuw nsw i64 %vscale, 4
  br i1 %cmp.i133.not, label %exit78, label %for.body.us.preheader

for.body.us.preheader:                            ; preds = %for.body.lr.ph
  %.idx.i.us.2 = shl nuw nsw i64 %vscale, 5
  %.idx.i.us.3 = mul nuw nsw i64 %vscale, 48
  br label %for.body.us

for.body.us:                                      ; preds = %for.body.us.preheader, %while.cond.i..exit_crit_edge.us
  %indvars.iv = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next, %while.cond.i..exit_crit_edge.us ]
  %i4.0140.us = phi i32 [ 0, %for.body.us.preheader ], [ %inc.us, %while.cond.i..exit_crit_edge.us ]
  %3 = trunc nuw nsw i64 %indvars.iv to i32
  %mul6.us = shl i32 %3, 2
  %idx.ext.us = zext nneg i32 %mul6.us to i64
  %add.ptr.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext.us
  %mul11.us = or disjoint i32 %mul6.us, 4
  %idx.ext12.us = sext i32 %mul11.us to i64
  %add.ptr13.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext12.us
  %mul18.us = or disjoint i32 %mul6.us, 8
  %idx.ext19.us = sext i32 %mul18.us to i64
  %add.ptr20.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext19.us
  %mul25.us = or disjoint i32 %mul6.us, 12
  %idx.ext26.us = sext i32 %mul25.us to i64
  %add.ptr27.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext26.us
  %add.ptr29.us = getelementptr inbounds float, ptr %kernel, i64 %indvars.iv
  %4 = load float, ptr %add.ptr29.us, align 4
  %5 = fptrunc float %4 to half
  %.splatinsert.i.us = insertelement <vscale x 8 x half> poison, half %5, i64 0
  %6 = shufflevector <vscale x 8 x half> %.splatinsert.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx2.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 4
  %7 = load float, ptr %arrayidx2.i.us, align 4
  %8 = fptrunc float %7 to half
  %.splatinsert57.i.us = insertelement <vscale x 8 x half> poison, half %8, i64 0
  %9 = shufflevector <vscale x 8 x half> %.splatinsert57.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx3.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 8
  %10 = load float, ptr %arrayidx3.i.us, align 4
  %11 = fptrunc float %10 to half
  %.splatinsert58.i.us = insertelement <vscale x 8 x half> poison, half %11, i64 0
  %12 = shufflevector <vscale x 8 x half> %.splatinsert58.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx4.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 12
  %13 = load float, ptr %arrayidx4.i.us, align 4
  %14 = fptrunc float %13 to half
  %.splatinsert59.i.us = insertelement <vscale x 8 x half> poison, half %14, i64 0
  %15 = shufflevector <vscale x 8 x half> %.splatinsert59.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  br label %for.cond.i.preheader.us

for.cond.i.preheader.us:                          ; preds = %for.body.us, %for.cond.i.preheader.us
  %vdst.0.i138.us = phi ptr [ %call5.i.i.i119, %for.body.us ], [ %add.ptr15.i.us, %for.cond.i.preheader.us ]
  %s1.0.i137.us = phi ptr [ %add.ptr.us, %for.body.us ], [ %add.ptr16.i.us, %for.cond.i.preheader.us ]
  %s2.0.i136.us = phi ptr [ %add.ptr13.us, %for.body.us ], [ %add.ptr17.i.us, %for.cond.i.preheader.us ]
  %s3.0.i135.us = phi ptr [ %add.ptr20.us, %for.body.us ], [ %add.ptr18.i.us, %for.cond.i.preheader.us ]
  %s4.0.i134.us = phi ptr [ %add.ptr27.us, %for.body.us ], [ %add.ptr19.i.us, %for.cond.i.preheader.us ]
  %16 = load <vscale x 8 x half>, ptr %s1.0.i137.us, align 16
  %17 = load <vscale x 8 x half>, ptr %vdst.0.i138.us, align 16
  %18 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %17, <vscale x 8 x half> %16, <vscale x 8 x half> %6)
  %19 = load <vscale x 8 x half>, ptr %s2.0.i136.us, align 16
  %20 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %18, <vscale x 8 x half> %19, <vscale x 8 x half> %9)
  %21 = load <vscale x 8 x half>, ptr %s3.0.i135.us, align 16
  %22 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %20, <vscale x 8 x half> %21, <vscale x 8 x half> %12)
  %23 = load <vscale x 8 x half>, ptr %s4.0.i134.us, align 16
  %24 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %22, <vscale x 8 x half> %23, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %24, ptr %vdst.0.i138.us, align 16
  %25 = getelementptr i8, ptr %s1.0.i137.us, i64 %vs2
  %26 = load <vscale x 8 x half>, ptr %25, align 16
  %27 = getelementptr i8, ptr %vdst.0.i138.us, i64 %vs2
  %28 = load <vscale x 8 x half>, ptr %27, align 16
  %29 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %28, <vscale x 8 x half> %26, <vscale x 8 x half> %6)
  %30 = getelementptr i8, ptr %s2.0.i136.us, i64 %vs2
  %31 = load <vscale x 8 x half>, ptr %30, align 16
  %32 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %29, <vscale x 8 x half> %31, <vscale x 8 x half> %9)
  %33 = getelementptr i8, ptr %s3.0.i135.us, i64 %vs2
  %34 = load <vscale x 8 x half>, ptr %33, align 16
  %35 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %32, <vscale x 8 x half> %34, <vscale x 8 x half> %12)
  %36 = getelementptr i8, ptr %s4.0.i134.us, i64 %vs2
  %37 = load <vscale x 8 x half>, ptr %36, align 16
  %38 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %35, <vscale x 8 x half> %37, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %38, ptr %27, align 16
  %39 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.2
  %40 = load <vscale x 8 x half>, ptr %39, align 16
  %41 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.2
  %42 = load <vscale x 8 x half>, ptr %41, align 16
  %43 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %42, <vscale x 8 x half> %40, <vscale x 8 x half> %6)
  %44 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.2
  %45 = load <vscale x 8 x half>, ptr %44, align 16
  %46 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %43, <vscale x 8 x half> %45, <vscale x 8 x half> %9)
  %47 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.2
  %48 = load <vscale x 8 x half>, ptr %47, align 16
  %49 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %46, <vscale x 8 x half> %48, <vscale x 8 x half> %12)
  %50 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.2
  %51 = load <vscale x 8 x half>, ptr %50, align 16
  %52 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %49, <vscale x 8 x half> %51, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %52, ptr %41, align 16
  %53 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.3
  %54 = load <vscale x 8 x half>, ptr %53, align 16
  %55 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.3
  %56 = load <vscale x 8 x half>, ptr %55, align 16
  %57 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %56, <vscale x 8 x half> %54, <vscale x 8 x half> %6)
  %58 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.3
  %59 = load <vscale x 8 x half>, ptr %58, align 16
  %60 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %57, <vscale x 8 x half> %59, <vscale x 8 x half> %9)
  %61 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.3
  %62 = load <vscale x 8 x half>, ptr %61, align 16
  %63 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %60, <vscale x 8 x half> %62, <vscale x 8 x half> %12)
  %64 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.3
  %65 = load <vscale x 8 x half>, ptr %64, align 16
  %66 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %63, <vscale x 8 x half> %65, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %66, ptr %55, align 16
  %add.ptr15.i.us = getelementptr inbounds half, ptr %vdst.0.i138.us, i64 %mul5.i
  %add.ptr16.i.us = getelementptr inbounds half, ptr %s1.0.i137.us, i64 %mul5.i
  %add.ptr17.i.us = getelementptr inbounds half, ptr %s2.0.i136.us, i64 %mul5.i
  %add.ptr18.i.us = getelementptr inbounds half, ptr %s3.0.i135.us, i64 %mul5.i
  %add.ptr19.i.us = getelementptr inbounds half, ptr %s4.0.i134.us, i64 %mul5.i
  %cmp.i.us = icmp ult ptr %add.ptr15.i.us, %add.ptr.i
  br i1 %cmp.i.us, label %for.cond.i.preheader.us, label %while.cond.i..exit_crit_edge.us

while.cond.i..exit_crit_edge.us:                  ; preds = %for.cond.i.preheader.us
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
  %inc.us = add nuw nsw i32 %i4.0140.us, 1
  %exitcond.not = icmp eq i32 %inc.us, %kw
  br i1 %exitcond.not, label %exit78, label %for.body.us

exit78:                                           ; preds = %while.cond.i..exit_crit_edge.us, %for.body.lr.ph, %entry
  ret void
}
