
Commit a76cb16

[CIR][CIRGen] Generate CIR for vset_lane and vsetq_lane intrinsics (#882)
As the title says. Note that for these intrinsics, just like OG, we do not lower to LLVM intrinsics; instead we emit a vector insert. The test case is partially taken from OG's [aarch64-neon-vget.c](https://github.com/llvm/clangir/blob/85bc6407f559221afebe08a60ed2b50bf1edf7fa/clang/test/CodeGen/aarch64-neon-vget.c), but I did not add both signed and unsigned int tests, because unsigned and signed types of the same width use the same intrinsic ID and therefore take exactly the same code path as far as this PR is concerned.

---------

Co-authored-by: Guojin He <[email protected]>
1 parent 85bc640 commit a76cb16
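To illustrate the lowering described above (a sketch, not part of the commit; the function name below is made up, and the IR line mirrors the insertelement pattern checked by the new test file): a lane-set intrinsic becomes a plain vector insert rather than a call to an AArch64 NEON LLVM intrinsic.

#include <arm_neon.h>

// Hypothetical example, mirroring the new test file in this commit.
uint8x8_t set_lane7(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);  // CIRGen emits a cir.vec.insert here.
}
// After lowering to LLVM IR this becomes a plain insertelement, e.g.:
//   %res = insertelement <8 x i8> %b, i8 %a, i32 7
// and no aarch64 NEON intrinsic call is emitted.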

2 files changed (+246, -2 lines)
clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

Lines changed: 8 additions & 2 deletions
@@ -2162,14 +2162,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vset_lane_i16:
   case NEON::BI__builtin_neon_vset_lane_i32:
   case NEON::BI__builtin_neon_vset_lane_i64:
-  case NEON::BI__builtin_neon_vset_lane_bf16:
   case NEON::BI__builtin_neon_vset_lane_f32:
   case NEON::BI__builtin_neon_vsetq_lane_i8:
   case NEON::BI__builtin_neon_vsetq_lane_i16:
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
-  case NEON::BI__builtin_neon_vsetq_lane_bf16:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
+    Ops.push_back(buildScalarExpr(E->getArg(2)));
+    return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
+                                                  Ops[1], Ops[0], Ops[2]);
+  case NEON::BI__builtin_neon_vset_lane_bf16:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
+    // No support for now, as there is no real/test case for them at the
+    // moment; the implementation should be the same as for the vset_lane
+    // and vsetq_lane intrinsics above.
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vset_lane_f64:
     // The vector type needs a cast for the v1f64 variant.
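For reference, a minimal sketch of what enabling the bf16 cases might look like if they simply reused the lane-insert lowering above, as the NYI comment suggests (hypothetical, not part of this commit; it assumes the surrounding Ops/builder setup is unchanged):

  case NEON::BI__builtin_neon_vset_lane_bf16:
  case NEON::BI__builtin_neon_vsetq_lane_bf16:
    // Hypothetical: fall through to the same vector-insert lowering once a
    // real use case and tests exist.
    Ops.push_back(buildScalarExpr(E->getArg(2)));
    return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
                                                  Ops[1], Ops[0], Ops[2]);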
Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,238 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-cir -target-feature +neon %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-llvm -target-feature +neon %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// This test file is similar to, but not the same as,
// clang/test/CodeGen/aarch64-neon-vget.c.
// The difference is that this file only tests the vset intrinsics; we feel
// it is more appropriate to test the vget intrinsics in a separate file
// named aarch64-neon-vget.c.
// Also, for each integer type we only test either signed or unsigned, not
// both, because integer types of the same size use the same intrinsic.

// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>

uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}

// CIR-LABEL: test_vset_lane_u8
// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i loc(#loc7)
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 8>

// LLVM: define dso_local <8 x i8> @test_vset_lane_u8(i8 [[A:%.*]], <8 x i8> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <8 x i8>, i64 1, align 8
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
// LLVM: store <8 x i8> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1
// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1
// LLVM: [[TMP_B0:%.*]] = load <8 x i8>, ptr [[B_ADR]], align 8
// LLVM: store <8 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1
// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i8>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 7
// LLVM: ret <8 x i8> {{%.*}}

uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}

// CIR-LABEL: test_vset_lane_u16
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 4>

// LLVM: define dso_local <4 x i16> @test_vset_lane_u16(i16 [[A:%.*]], <4 x i16> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <4 x i16>, i64 1, align 8
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
// LLVM: store <4 x i16> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2
// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2
// LLVM: [[TMP_B0:%.*]] = load <4 x i16>, ptr [[B_ADR]], align 8
// LLVM: store <4 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i16>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x i16> {{%.*}}

uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}

// CIR-LABEL: test_vset_lane_u32
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 2>

// LLVM: define dso_local <2 x i32> @test_vset_lane_u32(i32 [[A:%.*]], <2 x i32> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x i32>, i64 1, align 8
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
// LLVM: store <2 x i32> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4
// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <2 x i32>, ptr [[B_ADR]], align 8
// LLVM: store <2 x i32> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i32>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x i32> {{%.*}}

int64x1_t test_vset_lane_u64(int64_t a, int64x1_t b) {
  return vset_lane_u64(a, b, 0);
}

// CIR-LABEL: test_vset_lane_u64
// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 1>

// LLVM: define dso_local <1 x i64> @test_vset_lane_u64(i64 [[A:%.*]], <1 x i64> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <1 x i64>, i64 1, align 8
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
// LLVM: store <1 x i64> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8
// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8
// LLVM: [[TMP_B0:%.*]] = load <1 x i64>, ptr [[B_ADR]], align 8
// LLVM: store <1 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8
// LLVM: [[INTRN_ARG1:%.*]] = load <1 x i64>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <1 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 0
// LLVM: ret <1 x i64> {{%.*}}

float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}

// CIR-LABEL: test_vset_lane_f32
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local <2 x float> @test_vset_lane_f32(float [[A:%.*]], <2 x float> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x float>, i64 1, align 8
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
// LLVM: store <2 x float> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4
// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <2 x float>, ptr [[B_ADR]], align 8
// LLVM: store <2 x float> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x float>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x float> {{%.*}}

uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}

// CIR-LABEL: test_vsetq_lane_u8
// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 16>

// LLVM: define dso_local <16 x i8> @test_vsetq_lane_u8(i8 [[A:%.*]], <16 x i8> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <16 x i8>, i64 1, align 16
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
// LLVM: store <16 x i8> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1
// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1
// LLVM: [[TMP_B0:%.*]] = load <16 x i8>, ptr [[B_ADR]], align 16
// LLVM: store <16 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1
// LLVM: [[INTRN_ARG1:%.*]] = load <16 x i8>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <16 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 15
// LLVM: ret <16 x i8> {{%.*}}

uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}

// CIR-LABEL: test_vsetq_lane_u16
// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 8>

// LLVM: define dso_local <8 x i16> @test_vsetq_lane_u16(i16 [[A:%.*]], <8 x i16> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <8 x i16>, i64 1, align 16
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
// LLVM: store <8 x i16> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2
// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2
// LLVM: [[TMP_B0:%.*]] = load <8 x i16>, ptr [[B_ADR]], align 16
// LLVM: store <8 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2
// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i16>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 7
// LLVM: ret <8 x i16> {{%.*}}

uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}

// CIR-LABEL: test_vsetq_lane_u32
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 4>

// LLVM: define dso_local <4 x i32> @test_vsetq_lane_u32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x i32>, i64 1, align 16
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
// LLVM: store <4 x i32> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4
// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <4 x i32>, ptr [[B_ADR]], align 16
// LLVM: store <4 x i32> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i32>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x i32> {{%.*}}

int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}

// CIR-LABEL: test_vsetq_lane_s64
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 2>

// LLVM: define dso_local <2 x i64> @test_vsetq_lane_s64(i64 [[A:%.*]], <2 x i64> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <2 x i64>, i64 1, align 16
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
// LLVM: store <2 x i64> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8
// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8
// LLVM: [[TMP_B0:%.*]] = load <2 x i64>, ptr [[B_ADR]], align 16
// LLVM: store <2 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i64>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x i64> {{%.*}}

float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}

// CIR-LABEL: test_vsetq_lane_f32
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 4>

// LLVM: define dso_local <4 x float> @test_vsetq_lane_f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x float>, i64 1, align 16
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
// LLVM: store <4 x float> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4
// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <4 x float>, ptr [[B_ADR]], align 16
// LLVM: store <4 x float> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x float>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x float> {{%.*}}
