-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[IA] Relax the requirement of having ExtractValue users on deinterleave intrinsic #148716
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[IA] Relax the requirement of having ExtractValue users on deinterleave intrinsic #148716
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-risc-v Author: Min-Yih Hsu (mshockwave) Changes: There are cases where InstCombine / InstSimplify might sink extractvalue instructions that use a deinterleave intrinsic into successor blocks, which prevents InterleavedAccess from kicking in, because extractvalue + deinterleave is currently the only pattern IA recognizes w.r.t. the deinterleave intrinsic, while we could have just replaced the users of the deinterleave intrinsic with whatever is generated by the target TLI hooks. This strategy works perfectly for […content elided in this excerpt…]. In addition to the challenge of […content elided in this excerpt…]. Patch is 44.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148716.diff 14 Files Affected:
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index af1e0d7251a4f..8d7407949b3f0 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -23,6 +23,7 @@
#include "llvm/Support/Compiler.h"
namespace llvm {
+class IntrinsicInst;
class TargetLibraryInfo;
/// The Vector Function Database.
@@ -188,6 +189,25 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID);
/// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID);
+/// A vector can either be deinterleaved through an intrinsic or a combination
+/// of shufflevector instructions. This is a thin abstraction layer to provide
+/// some common information like the deinterleaving factor.
+struct VectorDeinterleaving {
+ IntrinsicInst *DI = nullptr;
+ ArrayRef<Value *> Values;
+
+ unsigned getFactor() const;
+
+ Type *getDeinterleavedType() const;
+
+ explicit VectorDeinterleaving(IntrinsicInst *DI) : DI(DI) {}
+ explicit VectorDeinterleaving(ArrayRef<Value *> Values) : Values(Values) {}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ bool isValid() const { return (DI != nullptr) ^ !Values.empty(); }
+#endif
+};
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..b6e0fa13a99f6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -96,6 +96,7 @@ class TargetRegisterClass;
class TargetRegisterInfo;
class TargetTransformInfo;
class Value;
+struct VectorDeinterleaving;
class VPIntrinsic;
namespace Sched {
@@ -3228,9 +3229,10 @@ class LLVM_ABI TargetLoweringBase {
///
/// \p Load is a vp.load instruction.
/// \p Mask is a mask value
- /// \p DeinterleaveRes is a list of deinterleaved results.
+ /// \p VD represents either a deinterleave intrinsic or a list of
+ /// deinterleaved values.
virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const {
+ const VectorDeinterleaving &VD) const {
return false;
}
@@ -3250,10 +3252,9 @@ class LLVM_ABI TargetLoweringBase {
/// llvm.vector.deinterleave{2,3,5,7}
///
/// \p LI is the accompanying load instruction.
- /// \p DeinterleaveValues contains the deinterleaved values.
- virtual bool
- lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
- ArrayRef<Value *> DeinterleaveValues) const {
+ /// \p DI represents the deinterleave intrinsic.
+ virtual bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const {
return false;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 7f0ed0b60a785..cd2ab0edf6fd3 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -306,6 +306,25 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
}
}
+unsigned VectorDeinterleaving::getFactor() const {
+ assert(isValid());
+ if (DI)
+ return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ else
+ return Values.size();
+}
+
+Type *VectorDeinterleaving::getDeinterleavedType() const {
+ assert(getFactor() > 0);
+ if (DI) {
+ return *DI->getType()->subtype_begin();
+ } else {
+ Value *FirstActive =
+ *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
+ return FirstActive->getType();
+ }
+}
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 7259834975cf4..1363e8b6b28b6 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -381,7 +381,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
- if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
+ VectorDeinterleaving VD(ShuffleValues);
+ if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, VD))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
} else {
@@ -615,32 +616,17 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
return false;
- const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
assert(Factor && "unexpected deinterleave intrinsic");
- SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
- Value *LastFactor = nullptr;
- for (auto *User : DI->users()) {
- auto *Extract = dyn_cast<ExtractValueInst>(User);
- if (!Extract || Extract->getNumIndices() != 1)
- return false;
- unsigned Idx = Extract->getIndices()[0];
- if (DeinterleaveValues[Idx])
- return false;
- DeinterleaveValues[Idx] = Extract;
- LastFactor = Extract;
- }
-
- if (!LastFactor)
- return false;
-
if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Value *WideMask = VPLoad->getOperand(1);
Value *Mask =
- getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
+ getMask(WideMask, Factor, cast<VectorType>(VD.getDeinterleavedType()));
if (!Mask)
return false;
@@ -649,7 +635,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
// Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
// TLI function to emit target-specific interleaved instruction.
- if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
+ VectorDeinterleaving VD(DI);
+ if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, VD))
return false;
} else {
@@ -661,13 +648,10 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
<< " and factor = " << Factor << "\n");
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DI))
return false;
}
- for (Value *V : DeinterleaveValues)
- if (V)
- DeadInsts.insert(cast<Instruction>(V));
DeadInsts.insert(DI);
// We now have a target-specific load, so delete the old one.
DeadInsts.insert(cast<Instruction>(LoadedVal));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bde4ba993f69e..11ae8605c5a71 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17476,16 +17476,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
- unsigned Factor = DeinterleavedValues.size();
+ LoadInst *LI, IntrinsicInst *DI) const {
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
- Value *FirstActive = *llvm::find_if(DeinterleavedValues,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17513,7 +17512,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
+ Value *Result = nullptr;
if (NumLoads > 1) {
+ Result = PoisonValue::get(DI->getType());
+
// Create multiple legal small ldN.
SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
for (unsigned I = 0; I < NumLoads; ++I) {
@@ -17533,25 +17535,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned J = 0; J < Factor; ++J) {
- if (DeinterleavedValues[J])
- DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
- }
+
+ // Merge the values from different factors.
+ for (unsigned J = 0; J < Factor; ++J)
+ Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
} else {
- Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned I = 0; I < Factor; I++) {
- if (DeinterleavedValues[I]) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- }
}
+
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ DI->replaceAllUsesWith(Result);
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 65fe08e92c235..ca40884e3a5c0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -218,8 +218,8 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 00e969056df7d..743745ba5d65f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -437,14 +437,14 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const override;
+ const VectorDeinterleaving &VD) const override;
bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index a6ff22c4b391f..044b1eae84f26 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -14,6 +14,7 @@
#include "RISCVISelLowering.h"
#include "RISCVSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -233,17 +234,17 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
- const unsigned Factor = DeinterleaveValues.size();
+ LoadInst *LI, IntrinsicInst *DI) const {
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
+ assert(Factor && "unexpected deinterleaving factor");
if (Factor > 8)
return false;
assert(LI->isSimple());
IRBuilder<> Builder(LI);
- Value *FirstActive =
- *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
- VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+ VectorType *ResVTy = cast<VectorType>(VD.getDeinterleavedType());
const DataLayout &DL = LI->getDataLayout();
@@ -293,16 +294,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
}
- for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
- if (!DIV)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIV->replaceAllUsesWith(NewEV);
- }
-
+ DI->replaceAllUsesWith(Return);
return true;
}
@@ -419,16 +411,14 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
/// dealing with factor of 2 (extractvalue is still required for most of other
/// factors though).
bool RISCVTargetLowering::lowerInterleavedVPLoad(
- VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveResults) const {
- const unsigned Factor = DeinterleaveResults.size();
+ VPIntrinsic *Load, Value *Mask, const VectorDeinterleaving &VD) const {
assert(Mask && "Expect a valid mask");
assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
"Unexpected intrinsic");
- Value *FirstActive = *llvm::find_if(DeinterleaveResults,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ const unsigned Factor = VD.getFactor();
+ assert(Factor && "unexpected vector deinterleaving");
+ VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
auto &DL = Load->getModule()->getDataLayout();
Align Alignment = Load->getParamAlign(0).value_or(
@@ -494,14 +484,18 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
}
}
- for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
- if (!DIO)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
+ if (VD.DI) {
+ VD.DI->replaceAllUsesWith(Return);
+ } else {
+ for (auto [Idx, DIO] : enumerate(VD.Values)) {
+ if (!DIO)
+ continue;
+ // We have to create a brand new ExtractValue to replace each
+ // of these old ExtractValue instructions.
+ Value *NewEV =
+ Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+ DIO->replaceAllUsesWith(NewEV);
+ }
}
return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 3e822d357b667..861423026b2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -274,6 +274,59 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
}
+define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_partial:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vmv1r.v v8, v7
+; CHECK-NEXT: ret
+ %vec = load <24 x i8>, ptr %p
+ %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
+ %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+ %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+ %res0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8> } %res0, <8 x i8> %t2, 1
+ ret { <8 x i8>, <8 x i8> } %res1
+}
+
+; InterleavedAccess should kick in even if the users of the deinterleave intrinsic are not extractvalue instructions.
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a2, a2, 1
+; CHECK-NEXT: beqz a2, .LBB17_2
+; CHECK-NEXT: # %bb.1: # %bb0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB17_2: # %bb1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a1)
+; CHECK-NEXT: ret
+ br i1 %c, label %bb0, label %bb1
+
+bb0:
+ %vec0 = load <24 x i8>, ptr %p
+ %d0.0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec0)
+ br label %merge
+
+bb1:
+ %vec1 = load <24 x i8>, ptr %p1
+ %d0.1 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec1)
+ br label %merge
+
+merge:
+ %d0 = phi {<8 x i8>, <8 x i8>, <8 x i8>} [%d0.0, %bb0], [%d0.1, %bb1]
+ %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+ %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
+ %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
+}
+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 7fb822d20f892..0fa8e05acc8ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -66,6 +66,115 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor
ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
}
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_partial(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor3_partial:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: lui a2, 699051
+; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg3e32.v v7, (a0)
+; RV32-NEXT: vmv1r.v v8, v7
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor3_partial:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 699051
+; RV64-NEXT: addi a2, a2, -1365
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg3e32.v v7, (a0)
+; RV64-NEXT: vmv1r.v v8, v7
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 3
+ %wide.masked.load = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vsc...
[truncated]
|
@llvm/pr-subscribers-backend-aarch64 Author: Min-Yih Hsu (mshockwave) Changes: There are cases where InstCombine / InstSimplify might sink extractvalue instructions that use a deinterleave intrinsic into successor blocks, which prevents InterleavedAccess from kicking in, because extractvalue + deinterleave is currently the only pattern IA recognizes w.r.t. the deinterleave intrinsic, while we could have just replaced the users of the deinterleave intrinsic with whatever is generated by the target TLI hooks. This strategy works perfectly for […content elided in this excerpt…]. In addition to the challenge of […content elided in this excerpt…]. Patch is 44.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148716.diff 14 Files Affected:
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index af1e0d7251a4f..8d7407949b3f0 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -23,6 +23,7 @@
#include "llvm/Support/Compiler.h"
namespace llvm {
+class IntrinsicInst;
class TargetLibraryInfo;
/// The Vector Function Database.
@@ -188,6 +189,25 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID);
/// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID);
+/// A vector can either be deinterleaved through an intrinsic or a combination
+/// of shufflevector instructions. This is a thin abstraction layer to provide
+/// some common information like the deinterleaving factor.
+struct VectorDeinterleaving {
+ IntrinsicInst *DI = nullptr;
+ ArrayRef<Value *> Values;
+
+ unsigned getFactor() const;
+
+ Type *getDeinterleavedType() const;
+
+ explicit VectorDeinterleaving(IntrinsicInst *DI) : DI(DI) {}
+ explicit VectorDeinterleaving(ArrayRef<Value *> Values) : Values(Values) {}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ bool isValid() const { return (DI != nullptr) ^ !Values.empty(); }
+#endif
+};
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..b6e0fa13a99f6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -96,6 +96,7 @@ class TargetRegisterClass;
class TargetRegisterInfo;
class TargetTransformInfo;
class Value;
+struct VectorDeinterleaving;
class VPIntrinsic;
namespace Sched {
@@ -3228,9 +3229,10 @@ class LLVM_ABI TargetLoweringBase {
///
/// \p Load is a vp.load instruction.
/// \p Mask is a mask value
- /// \p DeinterleaveRes is a list of deinterleaved results.
+ /// \p VD represents either a deinterleave intrinsic or a list of
+ /// deinterleaved values.
virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const {
+ const VectorDeinterleaving &VD) const {
return false;
}
@@ -3250,10 +3252,9 @@ class LLVM_ABI TargetLoweringBase {
/// llvm.vector.deinterleave{2,3,5,7}
///
/// \p LI is the accompanying load instruction.
- /// \p DeinterleaveValues contains the deinterleaved values.
- virtual bool
- lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
- ArrayRef<Value *> DeinterleaveValues) const {
+ /// \p DI represents the deinterleave intrinsic.
+ virtual bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const {
return false;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 7f0ed0b60a785..cd2ab0edf6fd3 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -306,6 +306,25 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
}
}
+unsigned VectorDeinterleaving::getFactor() const {
+ assert(isValid());
+ if (DI)
+ return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ else
+ return Values.size();
+}
+
+Type *VectorDeinterleaving::getDeinterleavedType() const {
+ assert(getFactor() > 0);
+ if (DI) {
+ return *DI->getType()->subtype_begin();
+ } else {
+ Value *FirstActive =
+ *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
+ return FirstActive->getType();
+ }
+}
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 7259834975cf4..1363e8b6b28b6 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -381,7 +381,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
- if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
+ VectorDeinterleaving VD(ShuffleValues);
+ if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, VD))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
} else {
@@ -615,32 +616,17 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
return false;
- const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
assert(Factor && "unexpected deinterleave intrinsic");
- SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
- Value *LastFactor = nullptr;
- for (auto *User : DI->users()) {
- auto *Extract = dyn_cast<ExtractValueInst>(User);
- if (!Extract || Extract->getNumIndices() != 1)
- return false;
- unsigned Idx = Extract->getIndices()[0];
- if (DeinterleaveValues[Idx])
- return false;
- DeinterleaveValues[Idx] = Extract;
- LastFactor = Extract;
- }
-
- if (!LastFactor)
- return false;
-
if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Value *WideMask = VPLoad->getOperand(1);
Value *Mask =
- getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
+ getMask(WideMask, Factor, cast<VectorType>(VD.getDeinterleavedType()));
if (!Mask)
return false;
@@ -649,7 +635,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
// Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
// TLI function to emit target-specific interleaved instruction.
- if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
+ VectorDeinterleaving VD(DI);
+ if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, VD))
return false;
} else {
@@ -661,13 +648,10 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
<< " and factor = " << Factor << "\n");
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DI))
return false;
}
- for (Value *V : DeinterleaveValues)
- if (V)
- DeadInsts.insert(cast<Instruction>(V));
DeadInsts.insert(DI);
// We now have a target-specific load, so delete the old one.
DeadInsts.insert(cast<Instruction>(LoadedVal));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bde4ba993f69e..11ae8605c5a71 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17476,16 +17476,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
- unsigned Factor = DeinterleavedValues.size();
+ LoadInst *LI, IntrinsicInst *DI) const {
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
- Value *FirstActive = *llvm::find_if(DeinterleavedValues,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17513,7 +17512,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
+ Value *Result = nullptr;
if (NumLoads > 1) {
+ Result = PoisonValue::get(DI->getType());
+
// Create multiple legal small ldN.
SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
for (unsigned I = 0; I < NumLoads; ++I) {
@@ -17533,25 +17535,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned J = 0; J < Factor; ++J) {
- if (DeinterleavedValues[J])
- DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
- }
+
+ // Merge the values from different factors.
+ for (unsigned J = 0; J < Factor; ++J)
+ Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
} else {
- Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned I = 0; I < Factor; I++) {
- if (DeinterleavedValues[I]) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- }
}
+
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ DI->replaceAllUsesWith(Result);
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 65fe08e92c235..ca40884e3a5c0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -218,8 +218,8 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 00e969056df7d..743745ba5d65f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -437,14 +437,14 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const override;
+ const VectorDeinterleaving &VD) const override;
bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index a6ff22c4b391f..044b1eae84f26 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -14,6 +14,7 @@
#include "RISCVISelLowering.h"
#include "RISCVSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -233,17 +234,17 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
- const unsigned Factor = DeinterleaveValues.size();
+ LoadInst *LI, IntrinsicInst *DI) const {
+ VectorDeinterleaving VD(DI);
+ const unsigned Factor = VD.getFactor();
+ assert(Factor && "unexpected deinterleaving factor");
if (Factor > 8)
return false;
assert(LI->isSimple());
IRBuilder<> Builder(LI);
- Value *FirstActive =
- *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
- VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+ VectorType *ResVTy = cast<VectorType>(VD.getDeinterleavedType());
const DataLayout &DL = LI->getDataLayout();
@@ -293,16 +294,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
}
- for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
- if (!DIV)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIV->replaceAllUsesWith(NewEV);
- }
-
+ DI->replaceAllUsesWith(Return);
return true;
}
@@ -419,16 +411,14 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
/// dealing with factor of 2 (extractvalue is still required for most of other
/// factors though).
bool RISCVTargetLowering::lowerInterleavedVPLoad(
- VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveResults) const {
- const unsigned Factor = DeinterleaveResults.size();
+ VPIntrinsic *Load, Value *Mask, const VectorDeinterleaving &VD) const {
assert(Mask && "Expect a valid mask");
assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
"Unexpected intrinsic");
- Value *FirstActive = *llvm::find_if(DeinterleaveResults,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ const unsigned Factor = VD.getFactor();
+ assert(Factor && "unexpected vector deinterleaving");
+ VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
auto &DL = Load->getModule()->getDataLayout();
Align Alignment = Load->getParamAlign(0).value_or(
@@ -494,14 +484,18 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
}
}
- for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
- if (!DIO)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
+ if (VD.DI) {
+ VD.DI->replaceAllUsesWith(Return);
+ } else {
+ for (auto [Idx, DIO] : enumerate(VD.Values)) {
+ if (!DIO)
+ continue;
+ // We have to create a brand new ExtractValue to replace each
+ // of these old ExtractValue instructions.
+ Value *NewEV =
+ Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+ DIO->replaceAllUsesWith(NewEV);
+ }
}
return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 3e822d357b667..861423026b2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -274,6 +274,59 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
}
+define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_partial:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vmv1r.v v8, v7
+; CHECK-NEXT: ret
+ %vec = load <24 x i8>, ptr %p
+ %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
+ %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+ %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+ %res0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8> } %res0, <8 x i8> %t2, 1
+ ret { <8 x i8>, <8 x i8> } %res1
+}
+
+; InterleavedAccess should kick in even if the users of the deinterleave intrinsic are not extractvalue.
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a2, a2, 1
+; CHECK-NEXT: beqz a2, .LBB17_2
+; CHECK-NEXT: # %bb.1: # %bb0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB17_2: # %bb1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v6, (a1)
+; CHECK-NEXT: ret
+ br i1 %c, label %bb0, label %bb1
+
+bb0:
+ %vec0 = load <24 x i8>, ptr %p
+ %d0.0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec0)
+ br label %merge
+
+bb1:
+ %vec1 = load <24 x i8>, ptr %p1
+ %d0.1 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec1)
+ br label %merge
+
+merge:
+ %d0 = phi {<8 x i8>, <8 x i8>, <8 x i8>} [%d0.0, %bb0], [%d0.1, %bb1]
+ %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+ %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
+ %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+ %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
+}
+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 7fb822d20f892..0fa8e05acc8ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -66,6 +66,115 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor
ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
}
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_partial(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor3_partial:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: lui a2, 699051
+; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg3e32.v v7, (a0)
+; RV32-NEXT: vmv1r.v v8, v7
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor3_partial:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: lui a2, 699051
+; RV64-NEXT: addi a2, a2, -1365
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg3e32.v v7, (a0)
+; RV64-NEXT: vmv1r.v v8, v7
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 3
+ %wide.masked.load = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vsc...
[truncated]
|
|
||
// Merge the values from different factors. | ||
for (unsigned J = 0; J < Factor; ++J) | ||
Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this extra insertvalue shouldn't affect codegen. @hassnaaHamdi do you think AArch64 also needs some codegen tests for InterleavedAccess?
/// A vector can either be deinterleaved through an intrinsic or a combination | ||
/// of shufflevector instructions. This is a thin abstraction layer to provide | ||
/// some common information like the deinterleaving factor. | ||
struct VectorDeinterleaving { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried to make this wrapper as simple as possible. I'm open to making it a union + tag or even std::variant
.
llvm/lib/Analysis/VectorUtils.cpp
Outdated
assert(isValid()); | ||
if (DI) | ||
return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); | ||
else |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No else after return
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
llvm/lib/Analysis/VectorUtils.cpp
Outdated
assert(getFactor() > 0); | ||
if (DI) { | ||
return *DI->getType()->subtype_begin(); | ||
} else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No else after return.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
ret { <8 x i8>, <8 x i8> } %res1 | ||
} | ||
|
||
; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue. | |
; InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1 | ||
} | ||
|
||
; InterleavedAccess should kick in even if the users of the deinterleave intrinsic are not extractvalue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue. | |
; InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
Value *BaseAddr = LI->getPointerOperand(); | ||
Value *Result = nullptr; | ||
if (NumLoads > 1) { | ||
Result = PoisonValue::get(DI->getType()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this next to the insertvalue loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had been playing with a somewhat related idea, but in a slightly different direction. I think that trying to handle the shuffle-based and intrinsic-based cases via the same API was a mistake. If we design an API for the intrinsic lowering path, we can pass the "load like" instruction, and the deinterleave intrinsic directly. The fact that some of the loads are LoadInst, and others are IntrinsicInst are implementation details. Doing it that way achieves a similar effect in that the extracts become irrelevant, we're directly replacing the pair of load + deinterleave.
My approach does require that we handle the "one active" case through some kind of combine after rewriting (since the API no longer knows which extracts are used.) I was planning to go explore that, but hadn't gotten there yet.
FYI, I posted #148892. This is mostly just to force us not to forget about the one-active case when considering API design. I'm still planning on trying this as a combine, but wanted the quick and easy version in place while I did. |
One of the original motivations to piggy-back shufflevector path on top of intrinsic lowering path was because the vp.load + shufflevector case needs to propagate the mask. In other words, the interfaces were separated by whether we need to propagate mask (i.e. vp.load) or not (i.e. load). At that time it seemed fine, but I agree that with the new feature I'm trying to add here this does look like a mistake. And we should probably separate the interface by shufflevector v.s. intrinsic lowering path (for the mask problem, just pass mask in both interfaces), which can also get rid of the slightly awkward |
I've also been thinking about supporting masked.load/store in this codepath. That's a case which has the mask, but not the EVL. |
I have rebased and used the latest TLI hook interfaced added in #148978 . This gets rid of the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM w/minor comments.
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID); | ||
|
||
/// Given a deinterleaveN intrinsic, return the (narrow) type of each factor. | ||
LLVM_ABI Type *getDeinterleavedVectorType(IntrinsicInst *DI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can have this return a VectorType, and remove a couple casts.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
Value *WideMask = VPLoad->getOperand(1); | ||
Mask = getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType())); | ||
Mask = getMask(WideMask, Factor, | ||
cast<VectorType>(getDeinterleavedVectorType(DI))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
e.g. a removable cast
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
Value *FirstActive = *llvm::find_if(DeinterleavedValues, | ||
[](Value *V) { return V != nullptr; }); | ||
VectorType *VTy = cast<VectorType>(FirstActive->getType()); | ||
VectorType *VTy = cast<VectorType>(getDeinterleavedVectorType(DI)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
e.g. another
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
const unsigned Factor = DeinterleaveValues.size(); | ||
Instruction *Load, Value *Mask, IntrinsicInst *DI) const { | ||
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); | ||
assert(Factor && "unexpected deinterleaving factor"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Redundant assert, already checked in callee.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed
Value *FirstActive = | ||
*llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); | ||
VectorType *ResVTy = cast<VectorType>(FirstActive->getType()); | ||
VectorType *ResVTy = cast<VectorType>(getDeinterleavedVectorType(DI)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
e.g. another cast
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There are cases where InstCombine / InstSimplify might sink extractvalue instructions that use a deinterleave intrinsic into successor blocks, which prevents InterleavedAccess from kicking in because the current pattern requires the deinterleave intrinsic to be used by extractvalue. However, this requirement is a bit too strict, since we could have just replaced the users of the deinterleave intrinsic with whatever is generated by the target TLI hooks.
I also need to do some special handling in AArch64's
TLI::lowerDeinterleaveIntrinsicToLoad
to merge the values representing the individual factors before replacing the uses of the deinterleave intrinsic with it.