Skip to content

Commit a905203

Browse files
authored
[RISCV] Prefer strided load for interleave load with only one lane active (#115069)
If only one of the elements is actually used, then we can legally use a strided load in place of the segment load. Doing so reduces vector register pressure, so if both the segment load and the strided load are believed to execute element/segment at a time, then prefer the strided load variant. Note that I've seen the vectorizer emitting wide interleave loads to represent a strided load, so this does happen in practice. It doesn't matter much for small LMUL*NF, but at large NF it can start causing problems in register allocation. Note that this patch only covers the fixed vector formation cases. In theory, we should make the same change for scalable vectors, but we can currently only represent NF2 in scalable IR, and NF2 is assumed to be optimized to better than segment-at-a-time by default, so there's currently nothing to do.
1 parent 332fda8 commit a905203

File tree

4 files changed

+61
-31
lines changed

4 files changed

+61
-31
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21585,6 +21585,8 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
2158521585
bool RISCVTargetLowering::lowerInterleavedLoad(
2158621586
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
2158721587
ArrayRef<unsigned> Indices, unsigned Factor) const {
21588+
assert(Indices.size() == Shuffles.size());
21589+
2158821590
IRBuilder<> Builder(LI);
2158921591

2159021592
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
@@ -21595,6 +21597,27 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
2159521597

2159621598
auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
2159721599

21600+
// If the segment load is going to be performed segment at a time anyways
21601+
// and there's only one element used, use a strided load instead. This
21602+
// will be equally fast, and create less vector register pressure.
21603+
if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
21604+
unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8;
21605+
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
21606+
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
21607+
Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
21608+
Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
21609+
Value *VL = Builder.getInt32(VTy->getNumElements());
21610+
21611+
CallInst *CI =
21612+
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
21613+
{VTy, BasePtr->getType(), Stride->getType()},
21614+
{BasePtr, Stride, Mask, VL});
21615+
CI->addParamAttr(
21616+
0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
21617+
Shuffles[0]->replaceAllUsesWith(CI);
21618+
return true;
21619+
};
21620+
2159821621
Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
2159921622

2160021623
CallInst *VlsegN = Builder.CreateIntrinsic(

llvm/lib/Target/RISCV/RISCVSubtarget.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,27 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
238238
return hasVInstructions() ? MaxInterleaveFactor : 1;
239239
}
240240

241+
bool hasOptimizedSegmentLoadStore(unsigned NF) const {
242+
switch (NF) {
243+
case 2:
244+
return hasOptimizedNF2SegmentLoadStore();
245+
case 3:
246+
return hasOptimizedNF3SegmentLoadStore();
247+
case 4:
248+
return hasOptimizedNF4SegmentLoadStore();
249+
case 5:
250+
return hasOptimizedNF5SegmentLoadStore();
251+
case 6:
252+
return hasOptimizedNF6SegmentLoadStore();
253+
case 7:
254+
return hasOptimizedNF7SegmentLoadStore();
255+
case 8:
256+
return hasOptimizedNF8SegmentLoadStore();
257+
default:
258+
llvm_unreachable("Unexpected NF");
259+
}
260+
}
261+
241262
// Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
242263
// vector hardware implementation which may be less than VLEN.
243264
unsigned getDLenFactor() const {

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -716,28 +716,6 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
716716
return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
717717
}
718718

719-
static bool hasOptimizedSegmentLoadStore(unsigned NF,
720-
const RISCVSubtarget *ST) {
721-
switch (NF) {
722-
case 2:
723-
return ST->hasOptimizedNF2SegmentLoadStore();
724-
case 3:
725-
return ST->hasOptimizedNF3SegmentLoadStore();
726-
case 4:
727-
return ST->hasOptimizedNF4SegmentLoadStore();
728-
case 5:
729-
return ST->hasOptimizedNF5SegmentLoadStore();
730-
case 6:
731-
return ST->hasOptimizedNF6SegmentLoadStore();
732-
case 7:
733-
return ST->hasOptimizedNF7SegmentLoadStore();
734-
case 8:
735-
return ST->hasOptimizedNF8SegmentLoadStore();
736-
default:
737-
llvm_unreachable("Unexpected NF");
738-
}
739-
}
740-
741719
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
742720
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
743721
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -761,7 +739,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
761739

762740
// Some processors optimize segment loads/stores as one wide memory op +
763741
// Factor * LMUL shuffle ops.
764-
if (hasOptimizedSegmentLoadStore(Factor, ST)) {
742+
if (ST->hasOptimizedSegmentLoadStore(Factor)) {
765743
InstructionCost Cost =
766744
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
767745
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,8 +1202,9 @@ define <4 x i32> @load_factor2_one_active(ptr %ptr) {
12021202
define <4 x i32> @load_factor3_one_active(ptr %ptr) {
12031203
; CHECK-LABEL: load_factor3_one_active:
12041204
; CHECK: # %bb.0:
1205+
; CHECK-NEXT: li a1, 12
12051206
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1206-
; CHECK-NEXT: vlseg3e32.v v8, (a0)
1207+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12071208
; CHECK-NEXT: ret
12081209
%interleaved.vec = load <12 x i32>, ptr %ptr
12091210
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1213,8 +1214,9 @@ define <4 x i32> @load_factor3_one_active(ptr %ptr) {
12131214
define <4 x i32> @load_factor4_one_active(ptr %ptr) {
12141215
; CHECK-LABEL: load_factor4_one_active:
12151216
; CHECK: # %bb.0:
1217+
; CHECK-NEXT: li a1, 16
12161218
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1217-
; CHECK-NEXT: vlseg4e32.v v8, (a0)
1219+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12181220
; CHECK-NEXT: ret
12191221
%interleaved.vec = load <16 x i32>, ptr %ptr
12201222
%v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1224,8 +1226,9 @@ define <4 x i32> @load_factor4_one_active(ptr %ptr) {
12241226
define <4 x i32> @load_factor5_one_active(ptr %ptr) {
12251227
; CHECK-LABEL: load_factor5_one_active:
12261228
; CHECK: # %bb.0:
1229+
; CHECK-NEXT: li a1, 20
12271230
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1228-
; CHECK-NEXT: vlseg5e32.v v8, (a0)
1231+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12291232
; CHECK-NEXT: ret
12301233
%interleaved.vec = load <20 x i32>, ptr %ptr
12311234
%v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
@@ -1235,30 +1238,35 @@ define <4 x i32> @load_factor5_one_active(ptr %ptr) {
12351238
define <2 x i16> @load_factor6_one_active(ptr %ptr) {
12361239
; CHECK-LABEL: load_factor6_one_active:
12371240
; CHECK: # %bb.0:
1241+
; CHECK-NEXT: addi a0, a0, 10
1242+
; CHECK-NEXT: li a1, 12
12381243
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
1239-
; CHECK-NEXT: vlseg6e16.v v8, (a0)
1244+
; CHECK-NEXT: vlse16.v v8, (a0), a1
12401245
; CHECK-NEXT: ret
12411246
%interleaved.vec = load <12 x i16>, ptr %ptr
1242-
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
1247+
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
12431248
ret <2 x i16> %v0
12441249
}
12451250

12461251
define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
12471252
; CHECK-LABEL: load_factor7_one_active:
12481253
; CHECK: # %bb.0:
1254+
; CHECK-NEXT: addi a0, a0, 1
1255+
; CHECK-NEXT: li a1, 7
12491256
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
1250-
; CHECK-NEXT: vlseg7e8.v v8, (a0)
1257+
; CHECK-NEXT: vlse8.v v8, (a0), a1
12511258
; CHECK-NEXT: ret
12521259
%interleaved.vec = load <32 x i8>, ptr %ptr
1253-
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
1260+
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
12541261
ret <4 x i8> %v0
12551262
}
12561263

12571264
define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
12581265
; CHECK-LABEL: load_factor8_one_active:
12591266
; CHECK: # %bb.0:
1267+
; CHECK-NEXT: li a1, 8
12601268
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
1261-
; CHECK-NEXT: vlseg8e8.v v8, (a0)
1269+
; CHECK-NEXT: vlse8.v v8, (a0), a1
12621270
; CHECK-NEXT: ret
12631271
%interleaved.vec = load <32 x i8>, ptr %ptr
12641272
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>

0 commit comments

Comments
 (0)