
Commit 07ee9bd

[RISCV] Add fixed vector coverage for sum-absolute-difference (sad) pattern
This builds on the previously added absolute difference cases, and adds the reduction at the end. This is mostly interesting for examining the impact of extend placement when changing the abdu lowering.
1 parent 4b941ff commit 07ee9bd
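
For context, the sum-absolute-difference (SAD) pattern exercised by these tests corresponds to scalar code along the following lines. This is a minimal sketch, not part of the commit; the function name and the fixed 16-element width are assumptions chosen to mirror the tests, which zero-extend the i8 inputs, subtract, take the absolute value, and then add-reduce the result.

#include <stdint.h>
#include <stdlib.h>

/* Scalar SAD sketch (illustrative only): widen each u8 pair,
   subtract, take |diff|, and accumulate into a wider i32 sum. */
int32_t sad_16x8_as_i32(const uint8_t a[16], const uint8_t b[16]) {
  int32_t sum = 0;
  for (int i = 0; i < 16; i++)
    sum += abs((int32_t)a[i] - (int32_t)b[i]);
  return sum;
}

Because the accumulator is wider than the i8 elements, where the zero-extends sit relative to the subtract and the final reduction is exactly what the abdu lowering change mentioned above would affect.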

File tree

1 file changed: +191 -0 lines changed

@@ -0,0 +1,191 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i16>
  %3 = zext <4 x i8> %b to <4 x i16>
  %4 = sub nsw <4 x i16> %1, %3
  %5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i32>
  %3 = zext <4 x i8> %b to <4 x i32>
  %4 = sub nsw <4 x i32> %1, %3
  %5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  ret i32 %6
}

define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i16>
  %3 = zext <16 x i8> %b to <16 x i16>
  %4 = sub nsw <16 x i16> %1, %3
  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i32>
  %3 = zext <16 x i8> %b to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}

define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vrsub.vi v12, v10, 0
; CHECK-NEXT:    vmax.vv v12, v10, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v14, (a0)
; CHECK-NEXT:    vle8.v v15, (a1)
; CHECK-NEXT:    vmax.vv v16, v10, v8
; CHECK-NEXT:    vwaddu.vv v8, v16, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v12, v14, v15
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v14, v12, 0
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v16, (a0)
; CHECK-NEXT:    vle8.v v17, (a1)
; CHECK-NEXT:    vmax.vv v12, v12, v14
; CHECK-NEXT:    vwaddu.wv v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v12, v16, v17
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v14, v12, 0
; CHECK-NEXT:    vmax.vv v12, v12, v14
; CHECK-NEXT:    vwaddu.wv v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %idx.ext8 = sext i32 %strideb to i64
  %idx.ext = sext i32 %stridea to i64
  %0 = load <16 x i8>, ptr %a, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
  %7 = load <16 x i8>, ptr %add.ptr, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = load <16 x i8>, ptr %add.ptr9, align 1
  %10 = zext <16 x i8> %9 to <16 x i32>
  %11 = sub nsw <16 x i32> %8, %10
  %12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <16 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <16 x i8> %14 to <16 x i32>
  %16 = load <16 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = sub nsw <16 x i32> %15, %17
  %19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <16 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <16 x i8> %21 to <16 x i32>
  %23 = load <16 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <16 x i8> %23 to <16 x i32>
  %25 = sub nsw <16 x i32> %22, %24
  %26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
