Skip to content

Commit f837539

Browse files
davemgreen authored and llvmbot committed
[AArch64] Add invalid 1 x vscale costs for reductions and reduction-operations. (#102105)
The code-generator is currently not able to handle scalable vectors of <vscale x 1 x eltty>. The usual "fix" for this, until it is supported, is to mark the costs of loads/stores with an invalid cost, preventing the vectorizer from vectorizing at those factors. But on rare occasions loops do not contain loads/stores, only reductions. So, whilst this is still unsupported, return an invalid cost to avoid selecting vscale x 1 VFs. The cost of a reduction is not currently used by the vectorizer, so this adds the cost to the add/mul/and/or/xor or min/max that should feed the reduction. It includes reduction costs too, for completeness. This change will be removed when code-generation for these types is sufficiently reliable.

Fixes #99760

(cherry picked from commit 0b745a1)
1 parent d033ae1 commit f837539

File tree

6 files changed

+107
-0
lines changed

6 files changed

+107
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -540,7 +540,15 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
540540
InstructionCost
541541
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
542542
TTI::TargetCostKind CostKind) {
543+
// The code-generator is currently not able to handle scalable vectors
544+
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
545+
// it. This change will be removed when code-generation for these types is
546+
// sufficiently reliable.
543547
auto *RetTy = ICA.getReturnType();
548+
if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
549+
if (VTy->getElementCount() == ElementCount::getScalable(1))
550+
return InstructionCost::getInvalid();
551+
544552
switch (ICA.getID()) {
545553
case Intrinsic::experimental_vector_histogram_add:
546554
if (!ST->hasSVE2())
@@ -3018,6 +3026,14 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
30183026
ArrayRef<const Value *> Args,
30193027
const Instruction *CxtI) {
30203028

3029+
// The code-generator is currently not able to handle scalable vectors
3030+
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3031+
// it. This change will be removed when code-generation for these types is
3032+
// sufficiently reliable.
3033+
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3034+
if (VTy->getElementCount() == ElementCount::getScalable(1))
3035+
return InstructionCost::getInvalid();
3036+
30213037
// TODO: Handle more cost kinds.
30223038
if (CostKind != TTI::TCK_RecipThroughput)
30233039
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -3792,6 +3808,14 @@ InstructionCost
37923808
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
37933809
FastMathFlags FMF,
37943810
TTI::TargetCostKind CostKind) {
3811+
// The code-generator is currently not able to handle scalable vectors
3812+
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3813+
// it. This change will be removed when code-generation for these types is
3814+
// sufficiently reliable.
3815+
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3816+
if (VTy->getElementCount() == ElementCount::getScalable(1))
3817+
return InstructionCost::getInvalid();
3818+
37953819
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
37963820

37973821
if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
@@ -3836,6 +3860,14 @@ InstructionCost
38363860
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
38373861
std::optional<FastMathFlags> FMF,
38383862
TTI::TargetCostKind CostKind) {
3863+
// The code-generator is currently not able to handle scalable vectors
3864+
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3865+
// it. This change will be removed when code-generation for these types is
3866+
// sufficiently reliable.
3867+
if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
3868+
if (VTy->getElementCount() == ElementCount::getScalable(1))
3869+
return InstructionCost::getInvalid();
3870+
38393871
if (TTI::requiresOrderedReduction(FMF)) {
38403872
if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
38413873
InstructionCost BaseCost =

llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ define void @fadd() {
88
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <vscale x 4 x half> undef, undef
99
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <vscale x 8 x half> undef, undef
1010
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <vscale x 16 x half> undef, undef
11+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fadd <vscale x 1 x float> undef, undef
1112
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <vscale x 2 x float> undef, undef
1213
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <vscale x 4 x float> undef, undef
1314
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <vscale x 8 x float> undef, undef
@@ -19,6 +20,7 @@ define void @fadd() {
1920
%V8F16 = fadd <vscale x 8 x half> undef, undef
2021
%V16F16 = fadd <vscale x 16 x half> undef, undef
2122

23+
%V1F32 = fadd <vscale x 1 x float> undef, undef
2224
%V2F32 = fadd <vscale x 2 x float> undef, undef
2325
%V4F32 = fadd <vscale x 4 x float> undef, undef
2426
%V8F32 = fadd <vscale x 8 x float> undef, undef
@@ -34,6 +36,7 @@ define void @fsub() {
3436
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <vscale x 4 x half> undef, undef
3537
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <vscale x 8 x half> undef, undef
3638
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <vscale x 16 x half> undef, undef
39+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fsub <vscale x 1 x float> undef, undef
3740
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <vscale x 2 x float> undef, undef
3841
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <vscale x 4 x float> undef, undef
3942
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <vscale x 8 x float> undef, undef
@@ -45,6 +48,7 @@ define void @fsub() {
4548
%V8F16 = fsub <vscale x 8 x half> undef, undef
4649
%V16F16 = fsub <vscale x 16 x half> undef, undef
4750

51+
%V1F32 = fsub <vscale x 1 x float> undef, undef
4852
%V2F32 = fsub <vscale x 2 x float> undef, undef
4953
%V4F32 = fsub <vscale x 4 x float> undef, undef
5054
%V8F32 = fsub <vscale x 8 x float> undef, undef

llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
define void @foo_no_vscale_range() {
55
; CHECK-LABEL: 'foo_no_vscale_range'
6+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
67
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
78
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
89
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
@@ -45,6 +46,7 @@ define void @foo_no_vscale_range() {
4546
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
4647
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4748
;
49+
%res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
4850
%res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
4951
%res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
5052
%res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)

llvm/test/Analysis/CostModel/AArch64/sve-arith.ll

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,34 @@ define void @scalable_mul() #0 {
4343
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
4444
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
4545
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
46+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i64 = mul <vscale x 1 x i64> undef, undef
4647
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4748
;
4849
entry:
4950
%mul_nxv16i8 = mul <vscale x 16 x i8> undef, undef
5051
%mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
5152
%mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
5253
%mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
54+
%mul_nxv1i64 = mul <vscale x 1 x i64> undef, undef
55+
56+
ret void
57+
}
58+
59+
define void @scalable_add() #0 {
60+
; CHECK-LABEL: 'scalable_add'
61+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv16i8 = add <vscale x 16 x i8> undef, undef
62+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv8i16 = add <vscale x 8 x i16> undef, undef
63+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv4i32 = add <vscale x 4 x i32> undef, undef
64+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv2i64 = add <vscale x 2 x i64> undef, undef
65+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i64 = add <vscale x 1 x i64> undef, undef
66+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
67+
;
68+
entry:
69+
%add_nxv16i8 = add <vscale x 16 x i8> undef, undef
70+
%add_nxv8i16 = add <vscale x 8 x i16> undef, undef
71+
%add_nxv4i32 = add <vscale x 4 x i32> undef, undef
72+
%add_nxv2i64 = add <vscale x 2 x i64> undef, undef
73+
%add_nxv1i64 = add <vscale x 1 x i64> undef, undef
5374

5475
ret void
5576
}

0 commit comments

Comments
 (0)