[CostModel] Handle vector struct results and cost llvm.sincos
#123210
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-transforms

Author: Benjamin Maxwell (MacDue)

Changes

This teaches the loop vectorizer that … The …

Depends on: #109833

Patch is 34.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123210.diff

10 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9048481b49189..00dbbc757f156 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -126,6 +126,7 @@ class IntrinsicCostAttributes {
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ TargetLibraryInfo const *LibInfo = nullptr;
public:
IntrinsicCostAttributes(
@@ -145,7 +146,8 @@ class IntrinsicCostAttributes {
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
const IntrinsicInst *I = nullptr,
- InstructionCost ScalarCost = InstructionCost::getInvalid());
+ InstructionCost ScalarCost = InstructionCost::getInvalid(),
+ TargetLibraryInfo const *LibInfo = nullptr);
Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
@@ -154,6 +156,7 @@ class IntrinsicCostAttributes {
InstructionCost getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
+ const TargetLibraryInfo *getLibInfo() const { return LibInfo; }
bool isTypeBasedOnly() const {
return Arguments.empty();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 032c7d7b5159e..d633b26811cf5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -1726,9 +1727,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
Type *RetTy = ICA.getReturnType();
- ElementCount RetVF =
- (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
- : ElementCount::getFixed(1));
+ ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
+ : ElementCount::getFixed(1);
+
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
@@ -1997,6 +1998,49 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
case Intrinsic::experimental_vector_match:
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+ case Intrinsic::sincos: {
+ // Vector variants of llvm.sincos can be mapped to a vector library call.
+ auto const *LibInfo = ICA.getLibInfo();
+ if (!LibInfo || !isVectorizedTy(RetTy))
+ break;
+
+ // Find associated libcall.
+ VectorType *VectorTy = cast<VectorType>(getContainedTypes(RetTy).front());
+ EVT VT = getTLI()->getValueType(DL, VectorTy);
+ RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getVectorElementType());
+ const char *LCName = getTLI()->getLibcallName(LC);
+ if (!LC || !LCName)
+ break;
+
+ // Search for a corresponding vector variant.
+ LLVMContext &Ctx = RetTy->getContext();
+ auto VF = getVectorizedTypeVF(RetTy);
+ VecDesc const *VD = nullptr;
+ for (bool Masked : {false, true}) {
+ if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+ break;
+ }
+ if (!VD)
+ break;
+
+ // Cost the call + mask.
+ auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(),
+ CostKind);
+ if (VD->isMasked())
+ Cost += thisT()->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
+ nullptr, {});
+
+ // Lowering to a sincos library call (with output pointers) may require us
+ // to emit reloads for the results.
+ Cost +=
+ thisT()->getMemoryOpCost(
+ Instruction::Load, VectorTy,
+ thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind) *
+ 2;
+ return Cost;
+ }
}
  // Assume that we need to scalarize this intrinsic.
@@ -2005,10 +2049,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (RetVF.isVector() && !RetVF.isScalable()) {
ScalarizationCost = 0;
- if (!RetTy->isVoidTy())
- ScalarizationCost += getScalarizationOverhead(
- cast<VectorType>(RetTy),
- /*Insert*/ true, /*Extract*/ false, CostKind);
+ if (!RetTy->isVoidTy()) {
+ for (Type *VectorTy : getContainedTypes(RetTy)) {
+ ScalarizationCost += getScalarizationOverhead(
+ cast<VectorType>(VectorTy),
+ /*Insert*/ true, /*Extract*/ false, CostKind);
+ }
+ }
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
}
@@ -2689,27 +2736,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
- if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+ if (isVectorizedTy(RetTy)) {
+ ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
+
// Scalable vectors cannot be scalarized, so return Invalid.
- if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
- return isa<ScalableVectorType>(Ty);
- }))
+ if (any_of(concat<Type *const>(RetVTys, Tys),
+ [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
return InstructionCost::getInvalid();
- InstructionCost ScalarizationCost =
- SkipScalarizationCost
- ? ScalarizationCostPassed
- : getScalarizationOverhead(RetVTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ InstructionCost ScalarizationCost = ScalarizationCostPassed;
+ if (!SkipScalarizationCost) {
+ ScalarizationCost = 0;
+ for (Type *RetVTy : RetVTys) {
+ ScalarizationCost += getScalarizationOverhead(
+ cast<VectorType>(RetVTy), /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ }
+ }
- unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+ unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
SmallVector<Type *, 4> ScalarTys;
for (Type *Ty : Tys) {
if (Ty->isVectorTy())
Ty = Ty->getScalarType();
ScalarTys.push_back(Ty);
}
- IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+ IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
InstructionCost ScalarCost =
thisT()->getIntrinsicInstrCost(Attrs, CostKind);
for (Type *Ty : Tys) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1ca9a16b18112..ed041dc3c8bfb 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -101,13 +101,12 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Argument->getType());
}
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
- ArrayRef<const Value *> Args,
- ArrayRef<Type *> Tys,
- FastMathFlags Flags,
- const IntrinsicInst *I,
- InstructionCost ScalarCost)
- : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+IntrinsicCostAttributes::IntrinsicCostAttributes(
+ Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
+ InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
+ : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
+ LibInfo(LibInfo) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 53be7fc0bee9f..dcfd3d5a8bd6e 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::atan2:
case Intrinsic::sin:
case Intrinsic::cos:
+ case Intrinsic::sincos:
case Intrinsic::tan:
case Intrinsic::sinh:
case Intrinsic::cosh:
@@ -185,6 +186,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::ucmp:
case Intrinsic::scmp:
return OpdIdx == -1 || OpdIdx == 0;
+ case Intrinsic::sincos:
case Intrinsic::is_fpclass:
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..f8e11c9c57e9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2926,7 +2926,8 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
[&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast<IntrinsicInst>(CI));
+ dyn_cast<IntrinsicInst>(CI),
+ InstructionCost::getInvalid(), TLI);
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1bba667c206cf..d50767df0b5e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1179,7 +1179,8 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
+ dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
index 22134d042fabb..f5f4445b34b02 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
@@ -68,46 +68,46 @@ define void @frexp_f16_i32() {
define void @frexp_f16_i16() {
; GFX7-LABEL: 'frexp_f16_i16'
; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX8PLUS-LABEL: 'frexp_f16_i16'
; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX7-SIZE-LABEL: 'frexp_f16_i16'
; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16'
; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instructi...
[truncated]
if (!RetTy->isVoidTy()) {
  for (Type *VectorTy : getContainedTypes(RetTy)) {
    ScalarizationCost += getScalarizationOverhead(
        cast<VectorType>(VectorTy),
        /*Insert*/ true, /*Extract*/ false, CostKind);
  }
}
@arsenm this is what causes the changes in the AMDGPU frexp.ll costs. Previously, this would be skipped as RetVF was 1 for struct-of-vector returns. Now it costs the inserts for both returned vectors.
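For context, the struct-of-vector helpers used here come from the dependency PR, so as a rough illustration of what "vectorized type" means in this discussion, here is a simplified sketch. This is an assumption about the helpers' behaviour, not their actual implementation:

// Rough sketch (assumed behaviour, not the real helper code): a "vectorized"
// type is either a plain vector, or a literal struct whose members are all
// vectors sharing the same element count.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static bool isVectorizedTySketch(Type *Ty) {
  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
    if (StructTy->getNumElements() == 0)
      return false;
    auto *FirstVecTy = dyn_cast<VectorType>(StructTy->getElementType(0));
    if (!FirstVecTy)
      return false;
    // Every member must be a vector with the same element count, so a single
    // VF describes the whole struct (e.g. { <4 x half>, <4 x i16> }).
    return all_of(StructTy->elements(), [&](Type *EltTy) {
      auto *VecTy = dyn_cast<VectorType>(EltTy);
      return VecTy && VecTy->getElementCount() == FirstVecTy->getElementCount();
    });
  }
  return Ty->isVectorTy();
}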
%r = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %Val)
%el = extractvalue { <4 x float>, <4 x float> } %r, 0
Note: I replaced this with an equivalent llvm.vector.deinterleave2 to test (which matches the intent of this test -- checking non-trivially-scalarizable struct-ret intrinsics are not scalarized).
This all looks sensible to me.
    VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
    nullptr, {});

// Lowering to a library call (with output pointers) may require us to emit
Always adding on the cost of a load seems pessimistic?
It is slightly pessimistic, but the cost is still reasonably low at 12 or 13 (rather than 10 for a plain libcall), so the vectorizer still chooses to widen the intrinsic. It also means that if libraries add structure-returning variants, they'll be preferred, as they'll have a slightly lower cost.
Given you've got explicit cost model tests in llvm/test/Analysis/CostModel/AArch64/sincos.ll for the BasicTTIImpl changes, I wonder if it's worth pulling the vectoriser changes out into a follow-on PR? That way you at least get to keep most of the changes upstream if the vectoriser bits get reverted for some reason.
This patch updates the cost model to cost intrinsics that return multiple values (in structs) correctly. Previously, the cost model only considered intrinsics that return VectorType to need scalarizing, which meant it costed intrinsics that return multiple vectors (that need scalarizing) far too cheaply (giving them the cost of a single function call). This patch also adds a custom cost for llvm.sincos when a vector function library is available, as certain VFs can be expanded (later in code gen) to a vector function, reducing the cost to a single call (plus the possible loads needed because the vector function returns its values via output pointers).
IntrinsicCostAttributes ICA(
    II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
    /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
You don't know if this is a libcall or not here. I also don't understand why this is guarded by TypeBasedIntrinsicCost. Can we remove that one too?
I only use the getIntrinsicInstrCost code path if one of the flags (type-based-intrinsic-cost or libcall-based-intrinsic-cost) is set, as I discovered some targets return a different cost depending on whether you call getIntrinsicInstrCost directly or via getInstructionCost. This seemed to be due to them modifying the cost returned from getIntrinsicInstrCost within their implementation of getInstructionCost. I didn't want to change tests that depend on this (and it's not relevant for AArch64), so I added a flag.
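In other words, the printer only goes through getIntrinsicInstrCost (and therefore the TLI-aware path) when one of those flags is set. A rough sketch of that guard, based on the snippet quoted above; the flag variables and surrounding names are assumptions rather than copies of the patch:

// Hypothetical sketch of the guard in the cost-model printing pass. Assumes
// Inst, TTI, TLI and CostKind are in scope; flag variable names are made up.
InstructionCost Cost;
auto *II = dyn_cast<IntrinsicInst>(&Inst);
if (II && (TypeBasedIntrinsicCost || PreferIntrinsicCost)) {
  // Passing the TLI lets BasicTTIImpl consider vector libcall mappings.
  IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
                              InstructionCost::getInvalid(),
                              /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
  Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
} else {
  // Default path: some targets adjust the intrinsic cost inside
  // getInstructionCost, so the two queries can legitimately differ.
  Cost = TTI.getInstructionCost(&Inst, CostKind);
}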
I must admit though it does seem a little odd now, because -type-based-intrinsic-cost also implies libcall-based-intrinsic-cost, since it causes TLI to be passed in. How about renaming libcall-based-intrinsic-cost to prefer-intrinsic-cost or use-intrinsic-cost, since it's not really tied to the library calls?

This is just a suggestion, but if we had a use-intrinsic-cost option we could actually make it an enum along the lines of:

enum IntrinsicCostType {
  None,
  NonTypeBased,
  TypeBased,
};

That way you can collapse the two options into one, and both TypeBased and NonTypeBased variants will use the TLI. However, I appreciate that would significantly increase the number of test files changed, so it's possibly something for a follow-on?
I've renamed the flag prefer-intrinsic-cost for this PR. The enum works nicely 👍, but since it does increase the churn a bit, I agree it's best to post that as a small follow-up PR.
We should at least have a firm plan for removing the flag or setting it to true, so users benefit from the new code by default. It may not be an issue for AArch64, but people building for AArch64 also won't get any benefit unless they know to set this flag, and there will be very little coverage of the code on larger projects.
Just to be clear this flag only affects the test cost-model print pass (so users = LLVM developers), not LLVM more generally. The loop vectorizer in #128035 always passes the TLI.
Changing the default would not be much trouble though -- it just requires adding a flag to a few tests.
The problem I see with changing the default of prefer-intrinsic-cost to true is that some tests may genuinely want to exercise the getInstructionCost path, in which case they have to live without using the TLI. I think it makes sense to follow this up in a separate PR, along with potentially changing it to use an enum instead of a boolean so that we can remove the type-based-intrinsic-cost flag as well?
Yep, I'm happy to address this in the follow-up PR (since there will be a little test churn in that patch anyway).
Just to be clear this flag only affects the test cost-model print pass (so users = LLVM developers), not LLVM more generally. The loop vectorizer in #128035 always passes the TLI.
Ok, that's good, thanks. I think ideally we would get rid of the option as a follow-up, as it seems confusing to have the cost model printer show something that's different to the cost that is actually used in the passes making the queries.
Would be good to get this in soon, as I have some downstream changes depending on it :)
LGTM, but please wait a day in case others have any objections!
Thanks for the reviews! I've got the final patch for …
Created a follow-up for the flags here: #128885. I've not removed them or changed the default yet, but feel free to discuss that on the new patch.
This patch updates the cost model to cost intrinsics that return multiple values (in structs) correctly. Previously, the cost model only considered intrinsics that return VectorType to need scalarizing, which meant it costed intrinsics that return multiple vectors (that need scalarizing) far too cheaply (giving them the cost of a single function call). This patch also adds a custom cost for llvm.sincos when a vector function library is available, as certain VFs can be expanded (later in code gen) to a vector function, reducing the cost to a single call (plus the possible loads needed because the vector function returns its values via output pointers).
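As a usage illustration, a pass that wants the libcall-aware cost can now thread its TargetLibraryInfo through the intrinsic cost attributes. Below is a minimal sketch, assuming TTI, TLI and a CallInst CI to llvm.sincos are in scope; it mirrors the vectorizer call sites above rather than copying them:

// Minimal sketch: cost a (possibly widened) llvm.sincos call, passing the
// TargetLibraryInfo so BasicTTIImpl can look for a matching vector libcall.
// Assumes TTI (TargetTransformInfo), TLI (TargetLibraryInfo) and CI (a
// CallInst to llvm.sincos) are available in the surrounding pass. A real
// caller would also forward the call's fast-math flags instead of the
// default-constructed FastMathFlags used here.
SmallVector<const Value *, 1> Args(CI->args());
SmallVector<Type *, 1> ParamTys{CI->getArgOperand(0)->getType()};
IntrinsicCostAttributes Attrs(
    Intrinsic::sincos, CI->getType(), Args, ParamTys,
    /*Flags=*/FastMathFlags(), dyn_cast<IntrinsicInst>(CI),
    /*ScalarCost=*/InstructionCost::getInvalid(), &TLI);
InstructionCost Cost =
    TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);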