Skip to content

Commit a905203

Browse files
authored
[RISCV] Prefer strided load for interleave load with only one lane active (#115069)
If only one of the elements is actually used, then we can legally use a strided load in place of the segment load. Doing so reduces vector register pressure, so if both the segment load and the strided load are believed to execute element/segment at a time, then prefer the strided load variant. Note that I've seen the vectorizer emitting wide interleave loads to represent a strided load, so this does happen in practice. It doesn't matter much for small LMUL*NF, but at large NF it can start causing problems in register allocation. Note that this patch only covers the fixed vector formation cases. In theory, we should make the same change for scalable vectors, but we can currently only represent NF2 in scalable IR, and NF2 is assumed to be optimized to better than segment-at-a-time by default, so there's currently nothing to do.
1 parent 332fda8 commit a905203

File tree

4 files changed

+61
-31
lines changed

4 files changed

+61
-31
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21585,6 +21585,8 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
2158521585
bool RISCVTargetLowering::lowerInterleavedLoad(
2158621586
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
2158721587
ArrayRef<unsigned> Indices, unsigned Factor) const {
21588+
assert(Indices.size() == Shuffles.size());
21589+
2158821590
IRBuilder<> Builder(LI);
2158921591

2159021592
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
@@ -21595,6 +21597,27 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
2159521597

2159621598
auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
2159721599

21600+
// If the segment load is going to be performed segment at a time anyways
21601+
// and there's only one element used, use a strided load instead. This
21602+
// will be equally fast, and create less vector register pressure.
21603+
if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
21604+
unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8;
21605+
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
21606+
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
21607+
Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
21608+
Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
21609+
Value *VL = Builder.getInt32(VTy->getNumElements());
21610+
21611+
CallInst *CI =
21612+
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
21613+
{VTy, BasePtr->getType(), Stride->getType()},
21614+
{BasePtr, Stride, Mask, VL});
21615+
CI->addParamAttr(
21616+
0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
21617+
Shuffles[0]->replaceAllUsesWith(CI);
21618+
return true;
21619+
};
21620+
2159821621
Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
2159921622

2160021623
CallInst *VlsegN = Builder.CreateIntrinsic(

llvm/lib/Target/RISCV/RISCVSubtarget.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,27 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
238238
return hasVInstructions() ? MaxInterleaveFactor : 1;
239239
}
240240

241+
bool hasOptimizedSegmentLoadStore(unsigned NF) const {
242+
switch (NF) {
243+
case 2:
244+
return hasOptimizedNF2SegmentLoadStore();
245+
case 3:
246+
return hasOptimizedNF3SegmentLoadStore();
247+
case 4:
248+
return hasOptimizedNF4SegmentLoadStore();
249+
case 5:
250+
return hasOptimizedNF5SegmentLoadStore();
251+
case 6:
252+
return hasOptimizedNF6SegmentLoadStore();
253+
case 7:
254+
return hasOptimizedNF7SegmentLoadStore();
255+
case 8:
256+
return hasOptimizedNF8SegmentLoadStore();
257+
default:
258+
llvm_unreachable("Unexpected NF");
259+
}
260+
}
261+
241262
// Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
242263
// vector hardware implementation which may be less than VLEN.
243264
unsigned getDLenFactor() const {

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -716,28 +716,6 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
716716
return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
717717
}
718718

719-
static bool hasOptimizedSegmentLoadStore(unsigned NF,
720-
const RISCVSubtarget *ST) {
721-
switch (NF) {
722-
case 2:
723-
return ST->hasOptimizedNF2SegmentLoadStore();
724-
case 3:
725-
return ST->hasOptimizedNF3SegmentLoadStore();
726-
case 4:
727-
return ST->hasOptimizedNF4SegmentLoadStore();
728-
case 5:
729-
return ST->hasOptimizedNF5SegmentLoadStore();
730-
case 6:
731-
return ST->hasOptimizedNF6SegmentLoadStore();
732-
case 7:
733-
return ST->hasOptimizedNF7SegmentLoadStore();
734-
case 8:
735-
return ST->hasOptimizedNF8SegmentLoadStore();
736-
default:
737-
llvm_unreachable("Unexpected NF");
738-
}
739-
}
740-
741719
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
742720
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
743721
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -761,7 +739,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
761739

762740
// Some processors optimize segment loads/stores as one wide memory op +
763741
// Factor * LMUL shuffle ops.
764-
if (hasOptimizedSegmentLoadStore(Factor, ST)) {
742+
if (ST->hasOptimizedSegmentLoadStore(Factor)) {
765743
InstructionCost Cost =
766744
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
767745
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,8 +1202,9 @@ define <4 x i32> @load_factor2_one_active(ptr %ptr) {
12021202
define <4 x i32> @load_factor3_one_active(ptr %ptr) {
12031203
; CHECK-LABEL: load_factor3_one_active:
12041204
; CHECK: # %bb.0:
1205+
; CHECK-NEXT: li a1, 12
12051206
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1206-
; CHECK-NEXT: vlseg3e32.v v8, (a0)
1207+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12071208
; CHECK-NEXT: ret
12081209
%interleaved.vec = load <12 x i32>, ptr %ptr
12091210
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1213,8 +1214,9 @@ define <4 x i32> @load_factor3_one_active(ptr %ptr) {
12131214
define <4 x i32> @load_factor4_one_active(ptr %ptr) {
12141215
; CHECK-LABEL: load_factor4_one_active:
12151216
; CHECK: # %bb.0:
1217+
; CHECK-NEXT: li a1, 16
12161218
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1217-
; CHECK-NEXT: vlseg4e32.v v8, (a0)
1219+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12181220
; CHECK-NEXT: ret
12191221
%interleaved.vec = load <16 x i32>, ptr %ptr
12201222
%v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1224,8 +1226,9 @@ define <4 x i32> @load_factor4_one_active(ptr %ptr) {
12241226
define <4 x i32> @load_factor5_one_active(ptr %ptr) {
12251227
; CHECK-LABEL: load_factor5_one_active:
12261228
; CHECK: # %bb.0:
1229+
; CHECK-NEXT: li a1, 20
12271230
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1228-
; CHECK-NEXT: vlseg5e32.v v8, (a0)
1231+
; CHECK-NEXT: vlse32.v v8, (a0), a1
12291232
; CHECK-NEXT: ret
12301233
%interleaved.vec = load <20 x i32>, ptr %ptr
12311234
%v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
@@ -1235,30 +1238,35 @@ define <4 x i32> @load_factor5_one_active(ptr %ptr) {
12351238
define <2 x i16> @load_factor6_one_active(ptr %ptr) {
12361239
; CHECK-LABEL: load_factor6_one_active:
12371240
; CHECK: # %bb.0:
1241+
; CHECK-NEXT: addi a0, a0, 10
1242+
; CHECK-NEXT: li a1, 12
12381243
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
1239-
; CHECK-NEXT: vlseg6e16.v v8, (a0)
1244+
; CHECK-NEXT: vlse16.v v8, (a0), a1
12401245
; CHECK-NEXT: ret
12411246
%interleaved.vec = load <12 x i16>, ptr %ptr
1242-
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
1247+
%v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
12431248
ret <2 x i16> %v0
12441249
}
12451250

12461251
define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
12471252
; CHECK-LABEL: load_factor7_one_active:
12481253
; CHECK: # %bb.0:
1254+
; CHECK-NEXT: addi a0, a0, 1
1255+
; CHECK-NEXT: li a1, 7
12491256
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
1250-
; CHECK-NEXT: vlseg7e8.v v8, (a0)
1257+
; CHECK-NEXT: vlse8.v v8, (a0), a1
12511258
; CHECK-NEXT: ret
12521259
%interleaved.vec = load <32 x i8>, ptr %ptr
1253-
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
1260+
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
12541261
ret <4 x i8> %v0
12551262
}
12561263

12571264
define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
12581265
; CHECK-LABEL: load_factor8_one_active:
12591266
; CHECK: # %bb.0:
1267+
; CHECK-NEXT: li a1, 8
12601268
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
1261-
; CHECK-NEXT: vlseg8e8.v v8, (a0)
1269+
; CHECK-NEXT: vlse8.v v8, (a0), a1
12621270
; CHECK-NEXT: ret
12631271
%interleaved.vec = load <32 x i8>, ptr %ptr
12641272
%v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>

0 commit comments

Comments
 (0)