Commit 87af9ee
[RISCV] Use experimental.vp.splat to splat specific vector length elements. (#101329)
Previously it was hard to create a scalable-vector splat with a specific vector length in LLVM IR, so we used riscv.vmv.v.x and riscv.vmv.v.f to do this work. But those two RVV intrinsics have strict type constraints and cannot support fixed vector types or illegal vector types. Using vp.splat preserves the old functionality and also generates more optimized code for fixed vector types and illegal vectors. This patch also fixes a crash caused by getEVT not handling ptr types.
1 parent 5dbbc3b · commit 87af9ee
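For context, RISCVCodeGenPrepare::expandVPStrideLoad rewrites a zero-strided vp.strided.load into a scalar load followed by a splat. A minimal before/after sketch of the splat itself, assuming an i64 element and an nxv1i64 result (value names are illustrative, not taken from the patch):

```llvm
; Old path: RVV move intrinsic, restricted to legal scalable types, with the
; i32 VL zero-extended to XLen (i64) on RV64 first.
%res.old = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64.i64(<vscale x 1 x i64> poison, i64 %val, i64 %vl)

; New path: an evl-aware splat that accepts any vector type and an i32 evl directly.
%res.new = call <vscale x 1 x i64> @llvm.experimental.vp.splat.nxv1i64(i64 %val, <vscale x 1 x i1> %mask, i32 %evl)
```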

3 files changed: +79 -28 lines changed

llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp (+2 -17)
```diff
@@ -187,25 +187,10 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
   auto *VTy = cast<VectorType>(II.getType());
 
   IRBuilder<> Builder(&II);
-
-  // Extend VL from i32 to XLen if needed.
-  if (ST->is64Bit())
-    VL = Builder.CreateZExt(VL, Builder.getInt64Ty());
-
   Type *STy = VTy->getElementType();
   Value *Val = Builder.CreateLoad(STy, BasePtr);
-  const auto &TLI = *ST->getTargetLowering();
-  Value *Res;
-
-  // TODO: Also support fixed/illegal vector types to splat with evl = vl.
-  if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
-    unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
-                                              : Intrinsic::riscv_vmv_v_x;
-    Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
-                                  {PoisonValue::get(VTy), Val, VL});
-  } else {
-    Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
-  }
+  Value *Res = Builder.CreateIntrinsic(Intrinsic::experimental_vp_splat, {VTy},
+                                       {Val, II.getOperand(2), VL});
 
   II.replaceAllUsesWith(Res);
   II.eraseFromParent();
```
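The deleted branch is everything the old path had to work around: the VL zero-extension to XLen, the legality check whose EVT::getEVT(VTy) call crashed on pointer element types, and the CreateVectorSplat fallback that splatted the full vector length instead of just evl lanes, which is why the vsetivli immediates in the fixed-vector tests below drop from 4 to 3. The new single call simply forwards the strided load's own mask (II.getOperand(2)) and VL. A sketch of the IR it produces for a fixed vector with evl = 3, mirroring the test below (value names are illustrative):

```llvm
%val = load i8, ptr %ptr
; Only the first 3 lanes are written by the splat, matching the load's evl.
%res = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 %val, <4 x i1> splat (i1 true), i32 3)
```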

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll (+4 -4)
```diff
@@ -638,14 +638,14 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
 define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
 ; CHECK-OPT:       # %bb.0:
-; CHECK-OPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
 ; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
 ; CHECK-OPT-NEXT:    ret
 ;
 ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
 ; CHECK-NO-OPT:       # %bb.0:
 ; CHECK-NO-OPT-NEXT:    lbu a0, 0(a0)
-; CHECK-NO-OPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
 ; CHECK-NO-OPT-NEXT:    vmv.v.x v8, a0
 ; CHECK-NO-OPT-NEXT:    ret
   %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3)
@@ -657,14 +657,14 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
 ; CHECK-OPT:       # %bb.0:
-; CHECK-OPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
 ; CHECK-OPT-NEXT:    ret
 ;
 ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
 ; CHECK-NO-OPT:       # %bb.0:
 ; CHECK-NO-OPT-NEXT:    flh fa5, 0(a0)
-; CHECK-NO-OPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; CHECK-NO-OPT-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NO-OPT-NEXT:    ret
   %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
```

llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll (+73 -7)
```diff
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT,CHECK-OPT-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT,CHECK-OPT-RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-RV64
 
 declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
 
@@ -823,15 +823,15 @@ define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
   ret <vscale x 1 x half> %load
 }
 
-define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr) {
-; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+define <vscale x 1 x i64> @zero_strided_vadd_nxv1i64(<vscale x 1 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv1i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv1i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
@@ -842,3 +842,69 @@ define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr)
   %w = add <vscale x 1 x i64> %v, %load
   ret <vscale x 1 x i64> %w
 }
+
+define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a1, vlenb
+; CHECK-RV32-NEXT:    srli a2, a1, 3
+; CHECK-RV32-NEXT:    sub a3, a2, a1
+; CHECK-RV32-NEXT:    sltu a4, a2, a3
+; CHECK-RV32-NEXT:    addi a4, a4, -1
+; CHECK-RV32-NEXT:    and a3, a4, a3
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v24, (a0), zero
+; CHECK-RV32-NEXT:    bltu a2, a1, .LBB55_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a2, a1
+; CHECK-RV32-NEXT:  .LBB55_2:
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v0, (a0), zero
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vadd.vv v16, v16, v24
+; CHECK-RV32-NEXT:    vadd.vv v8, v8, v0
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT:    vadd.vx v16, v16, a0
+; CHECK-RV64-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %load = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i32(ptr %ptr, i32 0, <vscale x 16 x i1> splat (i1 true), i32 %vscale)
+  %w = add <vscale x 16 x i64> %v, %load
+  ret <vscale x 16 x i64> %w
+}
+
+define <vscale x 1 x ptr> @zero_strided_vadd_nxv1p0(<vscale x 1 x ptr> %v, ptr %ptr) {
+; CHECK-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV32:       # %bb.0:
+; CHECK-OPT-RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-OPT-RV32-NEXT:    vlse32.v v8, (a0), zero
+; CHECK-OPT-RV32-NEXT:    ret
+;
+; CHECK-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV64:       # %bb.0:
+; CHECK-OPT-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-OPT-RV64-NEXT:    vlse64.v v8, (a0), zero
+; CHECK-OPT-RV64-NEXT:    ret
+;
+; CHECK-NO-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV32:       # %bb.0:
+; CHECK-NO-OPT-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-NO-OPT-RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NO-OPT-RV32-NEXT:    vmv.v.x v8, a0
+; CHECK-NO-OPT-RV32-NEXT:    ret
+;
+; CHECK-NO-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV64:       # %bb.0:
+; CHECK-NO-OPT-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-NO-OPT-RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NO-OPT-RV64-NEXT:    vmv.v.x v8, a0
+; CHECK-NO-OPT-RV64-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %load = call <vscale x 1 x ptr> @llvm.experimental.vp.strided.load.nxv1p0.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %vscale)
+  ret <vscale x 1 x ptr> %load
+}
```
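The two new tests cover the cases the old path handled poorly or not at all: nxv16i64 is an illegal type that must be split, and nxv1p0 exercises the pointer-element case that previously crashed in EVT::getEVT, which the vp.splat path sidesteps entirely. A sketch of the splat the pass now emits for the pointer test (value names are illustrative; the mask and evl come from the load):

```llvm
%val = load ptr, ptr %ptr
%res = call <vscale x 1 x ptr> @llvm.experimental.vp.splat.nxv1p0(ptr %val, <vscale x 1 x i1> splat (i1 true), i32 %evl)
```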
