Commit 144b2f5

[RISCV] Start vslide1down sequence with a dependency breaking splat (llvm#72691)
If we are using entirely vslide1downs to initialize an otherwise undef vector, we end up with an implicit_def as the source of the first vslide1down. This register has to be allocated and creates false dependencies with surrounding code. Instead, start our sequence with a vmv.v.x in the hopes of creating a dependency-breaking idiom. Unfortunately, it's not clear this will actually work: due to the VL=0 special case for tail agnostic (T.A.), the hardware has to work pretty hard to recognize that the vmv.v.x actually has no source dependence. I don't think we can reasonably expect all hardware to have optimized this case, but I also don't see any downside in preferring it.
1 parent fab690d commit 144b2f5
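
To make the change concrete, here is a minimal before/after sketch of the generated sequence for a fully-defined 4-element build_vector, condensed from the test updates below (VL, SEW/LMUL, and register choices are illustrative):

  # Before: v8 starts life as an IMPLICIT_DEF, so the first vslide1down
  # carries a false dependency on whatever last wrote v8.
  vsetivli zero, 4, e32, m1, ta, ma
  vslide1down.vx v8, v8, a0
  vslide1down.vx v8, v8, a1
  vslide1down.vx v8, v8, a2
  vslide1down.vx v8, v8, a3

  # After: lead with a tail-agnostic splat of the first element, in the
  # hope that hardware recognizes vmv.v.x as having no dependence on the
  # old value of v8.
  vsetivli zero, 4, e32, m1, ta, ma
  vmv.v.x v8, a0
  vslide1down.vx v8, v8, a1
  vslide1down.vx v8, v8, a2
  vslide1down.vx v8, v8, a3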

17 files changed: +303 −331 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 12 additions & 1 deletion
@@ -3798,13 +3798,24 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   const unsigned Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
 
-  SDValue Vec = DAG.getUNDEF(ContainerVT);
+  SDValue Vec;
   UndefCount = 0;
   for (SDValue V : Op->ops()) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
+
+    // Start our sequence with a TA splat in the hopes that hardware is able to
+    // recognize there's no dependency on the prior value of our temporary
+    // register.
+    if (!Vec) {
+      Vec = DAG.getSplatVector(VT, DL, V);
+      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      UndefCount = 0;
+      continue;
+    }
+
     if (UndefCount) {
       const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
       Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
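
One consequence of resetting UndefCount when the splat is emitted (my reading of the loop above; this example is an inference, not a test from this commit): leading undef lanes are simply absorbed by the splat, which is legal because those lanes may hold any value. For instance, a build_vector <4 x i32> <undef, undef, %a, %b> would lower roughly to:

  vsetivli zero, 4, e32, m1, ta, ma
  vmv.v.x v8, a0               # all four lanes hold %a; lanes 0-1 were undef, so any value is fine
  vslide1down.vx v8, v8, a1    # result {%a, %a, %a, %b}: lane 2 = %a, lane 3 = %b as required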

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll

Lines changed: 2 additions & 2 deletions
@@ -495,7 +495,7 @@ define <4 x i16> @bitcast_i64_v4i16(i64 %a) {
 ; RV32ELEN32-LABEL: bitcast_i64_v4i16:
 ; RV32ELEN32: # %bb.0:
 ; RV32ELEN32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV32ELEN32-NEXT: vslide1down.vx v8, v8, a0
+; RV32ELEN32-NEXT: vmv.v.x v8, a0
 ; RV32ELEN32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32ELEN32-NEXT: ret
 ;
@@ -530,7 +530,7 @@ define <2 x i32> @bitcast_i64_v2i32(i64 %a) {
 ; RV32ELEN32-LABEL: bitcast_i64_v2i32:
 ; RV32ELEN32: # %bb.0:
 ; RV32ELEN32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV32ELEN32-NEXT: vslide1down.vx v8, v8, a0
+; RV32ELEN32-NEXT: vmv.v.x v8, a0
 ; RV32ELEN32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32ELEN32-NEXT: ret
 ;

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll

Lines changed: 29 additions & 39 deletions
@@ -6,7 +6,7 @@ define <4 x i32> @add_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: add_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -30,7 +30,7 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
 ; CHECK-LABEL: add_constant_rhs_8xi32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vslide1down.vx v8, v8, a1
 ; CHECK-NEXT: vslide1down.vx v8, v8, a2
 ; CHECK-NEXT: vslide1down.vx v8, v8, a3
@@ -67,7 +67,7 @@ define <4 x i32> @sub_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: sub_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -91,7 +91,7 @@ define <4 x i32> @mul_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: mul_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI3_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -115,7 +115,7 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: udiv_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -152,7 +152,7 @@ define <4 x float> @fadd_constant_rhs(float %a, float %b, float %c, float %d) {
 ; CHECK-LABEL: fadd_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -176,7 +176,7 @@ define <4 x float> @fdiv_constant_rhs(float %a, float %b, float %c, float %d) {
 ; CHECK-LABEL: fdiv_constant_rhs:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -200,7 +200,7 @@ define <4 x i32> @add_constant_rhs_splat(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: add_constant_rhs_splat:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vslide1down.vx v8, v8, a1
 ; CHECK-NEXT: vslide1down.vx v8, v8, a2
 ; CHECK-NEXT: vslide1down.vx v8, v8, a3
@@ -226,7 +226,7 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV32-NEXT: addi a3, a3, 2047
 ; RV32-NEXT: addi a3, a3, 308
 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vmv.v.x v8, a0
 ; RV32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32-NEXT: vslide1down.vx v8, v8, a2
 ; RV32-NEXT: vslide1down.vx v8, v8, a3
@@ -239,7 +239,7 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV64-NEXT: addi a3, a3, 2047
 ; RV64-NEXT: addiw a3, a3, 308
 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vmv.v.x v8, a0
 ; RV64-NEXT: vslide1down.vx v8, v8, a1
 ; RV64-NEXT: vslide1down.vx v8, v8, a2
 ; RV64-NEXT: vslide1down.vx v8, v8, a3
@@ -263,7 +263,7 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV32-NEXT: addi a3, a3, 2047
 ; RV32-NEXT: addi a3, a3, 308
 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vmv.v.x v8, a0
 ; RV32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32-NEXT: vslide1down.vx v8, v8, a2
 ; RV32-NEXT: vslide1down.vx v8, v8, a3
@@ -276,7 +276,7 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV64-NEXT: addi a3, a3, 2047
 ; RV64-NEXT: addiw a3, a3, 308
 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vmv.v.x v8, a0
 ; RV64-NEXT: vslide1down.vx v8, v8, a1
 ; RV64-NEXT: vslide1down.vx v8, v8, a2
 ; RV64-NEXT: vslide1down.vx v8, v8, a3
@@ -293,25 +293,15 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 }
 
 define <4 x i32> @add_constant_rhs_identity2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; RV32-LABEL: add_constant_rhs_identity2:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, a0, 23
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: ret
-;
-; RV64-LABEL: add_constant_rhs_identity2:
-; RV64: # %bb.0:
-; RV64-NEXT: addiw a0, a0, 23
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: ret
+; CHECK-LABEL: add_constant_rhs_identity2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, 23
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: ret
 %e0 = add i32 %a, 23
 %v0 = insertelement <4 x i32> poison, i32 %e0, i32 0
 %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
@@ -324,7 +314,7 @@ define <4 x i32> @add_constant_rhs_inverse(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: add_constant_rhs_inverse:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI11_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -348,7 +338,7 @@ define <4 x i32> @add_constant_rhs_commute(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: add_constant_rhs_commute:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0)
 ; CHECK-NEXT: vle32.v v9, (a0)
@@ -377,20 +367,20 @@ define <4 x i32> @add_general_rhs(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f
 ; RV32-NEXT: add a2, a2, a6
 ; RV32-NEXT: add a3, a3, a7
 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vmv.v.x v8, a0
 ; RV32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32-NEXT: vslide1down.vx v8, v8, a2
 ; RV32-NEXT: vslide1down.vx v8, v8, a3
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: add_general_rhs:
 ; RV64: # %bb.0:
-; RV64-NEXT: addw a0, a0, a4
+; RV64-NEXT: add a0, a0, a4
 ; RV64-NEXT: addw a1, a1, a5
 ; RV64-NEXT: addw a2, a2, a6
 ; RV64-NEXT: addw a3, a3, a7
 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vmv.v.x v8, a0
 ; RV64-NEXT: vslide1down.vx v8, v8, a1
 ; RV64-NEXT: vslide1down.vx v8, v8, a2
 ; RV64-NEXT: vslide1down.vx v8, v8, a3
@@ -414,20 +404,20 @@ define <4 x i32> @add_general_splat(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; RV32-NEXT: add a2, a2, a4
 ; RV32-NEXT: add a3, a3, a4
 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vmv.v.x v8, a0
 ; RV32-NEXT: vslide1down.vx v8, v8, a1
 ; RV32-NEXT: vslide1down.vx v8, v8, a2
 ; RV32-NEXT: vslide1down.vx v8, v8, a3
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: add_general_splat:
 ; RV64: # %bb.0:
-; RV64-NEXT: addw a0, a0, a4
+; RV64-NEXT: add a0, a0, a4
 ; RV64-NEXT: addw a1, a1, a4
 ; RV64-NEXT: addw a2, a2, a4
 ; RV64-NEXT: addw a3, a3, a4
 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vmv.v.x v8, a0
 ; RV64-NEXT: vslide1down.vx v8, v8, a1
 ; RV64-NEXT: vslide1down.vx v8, v8, a2
 ; RV64-NEXT: vslide1down.vx v8, v8, a3

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 6 additions & 6 deletions
@@ -252,7 +252,7 @@ define <2 x half> @buildvec_v2f16(half %a, half %b) {
 ; CHECK-LABEL: buildvec_v2f16:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: ret
 %v1 = insertelement <2 x half> poison, half %a, i64 0
@@ -264,7 +264,7 @@ define <2 x float> @buildvec_v2f32(float %a, float %b) {
 ; CHECK-LABEL: buildvec_v2f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: ret
 %v1 = insertelement <2 x float> poison, float %a, i64 0
@@ -276,7 +276,7 @@ define <2 x double> @buildvec_v2f64(double %a, double %b) {
 ; CHECK-LABEL: buildvec_v2f64:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: ret
 %v1 = insertelement <2 x double> poison, double %a, i64 0
@@ -288,7 +288,7 @@ define <2 x double> @buildvec_v2f64_b(double %a, double %b) {
 ; CHECK-LABEL: buildvec_v2f64_b:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: ret
 %v1 = insertelement <2 x double> poison, double %b, i64 1
@@ -300,7 +300,7 @@ define <4 x float> @buildvec_v4f32(float %a, float %b, float %c, float %d) {
 ; CHECK-LABEL: buildvec_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
@@ -316,7 +316,7 @@ define <8 x float> @buildvec_v8f32(float %e0, float %e1, float %e2, float %e3, f
 ; CHECK-LABEL: buildvec_v8f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
