[RISCV][IA] Use strided load for one active deinterleaveN(load) #148892

Conversation
This adds the analogous handling we use for the shuffle lowering to the deinterleaveN intrinsic path.
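For context, the IR shape this targets, adapted from the tests in the diff below, is a factor-N deinterleave of a plain load where only one of the results is used (the value names here are illustrative):

  %vec = load <vscale x 32 x i8>, ptr %p
  %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
  %f1 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 1

Rather than emitting a segment load (vlseg4) that materializes all four fields, the new path emits a single strided load for the one live field.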
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Full diff: https://github.com/llvm/llvm-project/pull/148892.diff

2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 39603b92cc2f7..7dac87b07e990 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -243,20 +243,44 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
assert(LI->isSimple());
IRBuilder<> Builder(LI);
- Value *FirstActive =
- *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
- VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+ auto FirstActiveItr =
+ llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
+ VectorType *ResVTy = cast<VectorType>((*FirstActiveItr)->getType());
const DataLayout &DL = LI->getDataLayout();
-
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(), DL))
return false;
- Value *Return;
Type *PtrTy = LI->getPointerOperandType();
Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+ // If the segment load is going to be performed segment at a time anyways
+ // and there's only one element used, use a strided load instead. This
+ // will be equally fast, and create less vector register pressure.
+ if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
+ 1 == llvm::count_if(DeinterleaveValues,
+ [](Value *V) { return V != nullptr; })) {
+ unsigned Idx = std::distance(DeinterleaveValues.begin(), FirstActiveItr);
+ unsigned ScalarSizeInBytes = DL.getTypeStoreSize(ResVTy->getElementType());
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+ Value *Offset = ConstantInt::get(XLenTy, Idx * ScalarSizeInBytes);
+ Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
+ Value *Mask = Builder.getAllOnesMask(ResVTy->getElementCount());
+ Type *I32 = Type::getIntNTy(LI->getContext(), 32);
+ Value *VL = Builder.CreateElementCount(I32, ResVTy->getElementCount());
+
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+ {ResVTy, BasePtr->getType(), Stride->getType()},
+ {BasePtr, Stride, Mask, VL});
+ Align A = commonAlignment(LI->getAlign(), Idx * ScalarSizeInBytes);
+ CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), A));
+ (*FirstActiveItr)->replaceAllUsesWith(CI);
+ return true;
+ }
+
+ Value *Return;
if (isa<FixedVectorType>(ResVTy)) {
Value *VL = Builder.CreateElementCount(XLenTy, ResVTy->getElementCount());
Value *Mask = Builder.getAllOnesMask(ResVTy->getElementCount());
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 9af92aa995f1f..e28428224c2ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -407,8 +407,9 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -419,8 +420,10 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v5, (a0)
+; CHECK-NEXT: addi a0, a0, 3
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
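To make the new path concrete, here is a hand-written sketch, not taken from the patch, of roughly the IR the transform produces for the factor4_oneactive2 test above. The live field index is 3 and the element type is i8, so the offset is 3 * 1 = 3 bytes and the stride is 4 * 1 = 4 bytes, matching the addi a0, a0, 3 / li a1, 4 / vlse8.v sequence in the expected assembly (the function and value names are hypothetical):

  define <vscale x 8 x i8> @oneactive2_lowered(ptr %p) {
    ; Base pointer advanced to field 3: offset = Idx * sizeof(i8) = 3 bytes.
    %base = getelementptr i8, ptr %p, i64 3
    ; EVL is the full element count of <vscale x 8 x i8>, i.e. vscale * 8.
    %vs = call i32 @llvm.vscale.i32()
    %evl = mul i32 %vs, 8
    ; Stride = Factor * sizeof(i8) = 4 bytes, with an all-ones mask; the
    ; pointer alignment drops to 1 after the 3-byte offset (commonAlignment).
    %v = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(
             ptr align 1 %base, i64 4, <vscale x 8 x i1> splat (i1 true), i32 %evl)
    ret <vscale x 8 x i8> %v
  }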
Review comment (suggested change to the new comment):
- // If the segment load is going to be performed segment at a time anyways
+ // If the segment load is going to be performed one segment at a time anyways
Review comment on "// and there's only one element used, use a strided load instead":

element -> field?
Review comment (suggested change):
- // will be equally fast, and create less vector register pressure.
+ // will be equally fast, and creates less vector register pressure.
Review comment on "Type *I32 = Type::getIntNTy(LI->getContext(), 32);":

Type::getInt32Ty?
LGTM
Review comment (nit, suggested change):
- 1 == llvm::count_if(DeinterleaveValues,
-                     [](Value *V) { return V != nullptr; })) {
+ llvm::count_if(DeinterleaveValues,
+                [](Value *V) { return V != nullptr; }) == 1) {
This patch conflicts with #148716. As I was already planning on reworking this to be a combine eventually, I'm going to close this for the moment, and post something new once the API has stabilized a bit.