@@ -3815,6 +3815,27 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3815
3815
return LegalizationCost * LT.first ;
3816
3816
}
3817
3817
3818
+ // / Check if the mask is a DE-interleave mask of the given factor
3819
+ // / \p Factor like:
3820
+ // / <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
3821
+ static bool isDeInterleaveMaskOfFactor (ArrayRef<int > Mask, unsigned Factor) {
3822
+ // Check all potential start indices from 0 to (Factor - 1).
3823
+ for (unsigned Index = 0 ; Index < Factor; Index++) {
3824
+ unsigned i = 0 ;
3825
+
3826
+ // Check that elements are in ascending order by Factor. Ignore undef
3827
+ // elements.
3828
+ for (; i < Mask.size (); i++)
3829
+ if (Mask[i] >= 0 && static_cast <unsigned >(Mask[i]) != Index + i * Factor)
3830
+ break ;
3831
+
3832
+ if (i == Mask.size ())
3833
+ return true ;
3834
+ }
3835
+
3836
+ return false ;
3837
+ }
3838
+
3818
3839
InstructionCost AArch64TTIImpl::getShuffleCost (
3819
3840
TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int > Mask,
3820
3841
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
@@ -3827,9 +3848,18 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
3827
3848
Tp->getScalarSizeInBits () == LT.second .getScalarSizeInBits () &&
3828
3849
Mask.size () > LT.second .getVectorNumElements () && !Index && !SubTp) {
3829
3850
3851
+ // Check for LD3/LD4 instructions, which are represented in llvm IR as
3852
+ // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3853
+ // but we model it at max(1, LT.first / 4) so that LD3/LD4 have a higher
3854
+ // cost than just the load.
3855
+ if (Args.size () >= 1 && isa<LoadInst>(Args[0 ]) &&
3856
+ (isDeInterleaveMaskOfFactor (Mask, 3 ) ||
3857
+ isDeInterleaveMaskOfFactor (Mask, 4 )))
3858
+ return std::max<InstructionCost>(1 , LT.first / 4 );
3859
+
3830
3860
// Check for ST3/ST4 instructions, which are represented in llvm IR as
3831
3861
// store(interleaving-shuffle). The shuffle cost could potentially be free,
3832
- // but we model it with a cost of LT.first so that LD3/LD3 have a higher
3862
+ // but we model it with a cost of LT.first so that ST3/ST4 have a higher
3833
3863
// cost than just the store.
3834
3864
if (CxtI && CxtI->hasOneUse () && isa<StoreInst>(*CxtI->user_begin ()) &&
3835
3865
(ShuffleVectorInst::isInterleaveMask (
0 commit comments