Skip to content

Commit fcca24a

Browse files
committed
[AArch64] Add costs for LD3/LD4 shuffles.
Similar to llvm#87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which is represented in LLVM IR as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs, as instcombine will perform certain transforms without considering the cost.
1 parent aac695d commit fcca24a

File tree

3 files changed

+149
-119
lines changed

3 files changed

+149
-119
lines changed

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1376,7 +1376,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
13761376

13771377
return TargetTTI->getShuffleCost(
13781378
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
1379-
AdjustMask, CostKind, 0, nullptr, {}, Shuffle);
1379+
AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
13801380
}
13811381

13821382
// Narrowing shuffle - perform shuffle at original wider width and
@@ -1385,7 +1385,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
13851385

13861386
InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
13871387
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
1388-
VecSrcTy, AdjustMask, CostKind, 0, nullptr, {}, Shuffle);
1388+
VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
13891389

13901390
SmallVector<int, 16> ExtractMask(Mask.size());
13911391
std::iota(ExtractMask.begin(), ExtractMask.end(), 0);

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+31-1
Original file line numberDiff line numberDiff line change
@@ -3815,6 +3815,27 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
38153815
return LegalizationCost * LT.first;
38163816
}
38173817

3818+
/// Check if \p Mask is a de-interleave mask of the given factor \p Factor,
3819+
/// i.e. for some start Index in [0, Factor) it has the form:
3820+
/// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>. Negative (undef) elements are skipped, so an empty or all-undef mask is accepted when Factor > 0; Factor == 0 always yields false.
3821+
static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
3822+
// Check all potential start indices from 0 to (Factor - 1).
3823+
for (unsigned Index = 0; Index < Factor; Index++) {
3824+
unsigned i = 0;
3825+
3826+
// Element i must equal Index + i * Factor; stop at the first mismatch.
3827+
// Undef (negative) elements are ignored.
3828+
for (; i < Mask.size(); i++)
3829+
if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
3830+
break;
3831+
3832+
// The scan reached the end without a mismatch, so this start Index matches.
if (i == Mask.size())
3833+
return true;
3834+
}
3835+
3836+
return false;
3837+
}
3838+
38183839
InstructionCost AArch64TTIImpl::getShuffleCost(
38193840
TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
38203841
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
@@ -3827,9 +3848,18 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
38273848
Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
38283849
Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
38293850

3851+
// Check for LD3/LD4 instructions, which are represented in llvm IR as
3852+
// deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3853+
// but we model it with a cost of max(1, LT.first / 4) so that LD3/LD4 have
3854+
// a higher cost than just the load.
3855+
if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
3856+
(isDeInterleaveMaskOfFactor(Mask, 3) ||
3857+
isDeInterleaveMaskOfFactor(Mask, 4)))
3858+
return std::max<InstructionCost>(1, LT.first / 4);
3859+
38303860
// Check for ST3/ST4 instructions, which are represented in llvm IR as
38313861
// store(interleaving-shuffle). The shuffle cost could potentially be free,
3832-
// but we model it with a cost of LT.first so that LD3/LD3 have a higher
3862+
// but we model it with a cost of LT.first so that ST3/ST4 have a higher
38333863
// cost than just the store.
38343864
if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
38353865
(ShuffleVectorInst::isInterleaveMask(

0 commit comments

Comments
 (0)