[PPC] Implement areInlineCompatible
#126562
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-powerpc
Author: Henry Jiang (mustartt)
Changes: After the default implementation swap in #117493, areInlineCompatible checks whether the callee's features are a subset of the caller's features. This is not a safe assumption in general on PPC, so we fall back to strict feature-set equality for now and will see what improvements we can make.
Full diff: https://github.com/llvm/llvm-project/pull/126562.diff
2 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index c308ec332e84434..885e4b3fb323093 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -25,26 +25,29 @@ using namespace llvm;
#define DEBUG_TYPE "ppctti"
static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
-cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
+ cl::desc("add masking cost for i1 vectors"),
+ cl::init(true), cl::Hidden);
-static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
-cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ DisablePPCConstHoist("disable-ppc-constant-hoisting",
+ cl::desc("disable constant hoisting on PPC"),
+ cl::init(false), cl::Hidden);
static cl::opt<bool>
-EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
- cl::desc("Enable using coldcc calling conv for cold "
- "internal functions"));
+ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+ cl::desc("Enable using coldcc calling conv for cold "
+ "internal functions"));
static cl::opt<bool>
-LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
- cl::desc("Do not add instruction count to lsr cost model"));
+ LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Do not add instruction count to lsr cost model"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
- cl::desc("Loops with a constant trip count smaller than "
- "this value will not use the count register."));
+static cl::opt<unsigned> SmallCTRLoopThreshold(
+ "min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
//===----------------------------------------------------------------------===//
//
@@ -56,8 +59,9 @@ TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
- return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
- TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
+ return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow
+ ? TTI::PSK_SlowHardware
+ : TTI::PSK_FastHardware;
return TTI::PSK_Software;
}
@@ -290,14 +294,12 @@ InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return TTI::TCC_Free;
if (RunFree) {
- if (Imm.getBitWidth() <= 32 &&
- (isShiftedMask_32(Imm.getZExtValue()) ||
- isShiftedMask_32(~Imm.getZExtValue())))
+ if (Imm.getBitWidth() <= 32 && (isShiftedMask_32(Imm.getZExtValue()) ||
+ isShiftedMask_32(~Imm.getZExtValue())))
return TTI::TCC_Free;
- if (ST->isPPC64() &&
- (isShiftedMask_64(Imm.getZExtValue()) ||
- isShiftedMask_64(~Imm.getZExtValue())))
+ if (ST->isPPC64() && (isShiftedMask_64(Imm.getZExtValue()) ||
+ isShiftedMask_64(~Imm.getZExtValue())))
return TTI::TCC_Free;
}
@@ -364,14 +366,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
Call->getIntrinsicID() == Intrinsic::loop_decrement)
return false;
- SmallVector<BasicBlock*, 4> ExitingBlocks;
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
// If there is an exit edge known to be frequently taken,
// we should not transform this loop.
for (auto &BB : ExitingBlocks) {
Instruction *TI = BB->getTerminator();
- if (!TI) continue;
+ if (!TI)
+ continue;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
uint64_t TrueWeight = 0, FalseWeight = 0;
@@ -382,15 +385,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// If the exit path is more frequent than the loop path,
// we return here without further analysis for this loop.
bool TrueIsExit = !L->contains(BI->getSuccessor(0));
- if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ if ((TrueIsExit && FalseWeight < TrueWeight) ||
(!TrueIsExit && FalseWeight > TrueWeight))
return false;
}
}
LLVMContext &C = L->getHeader()->getContext();
- HWLoopInfo.CountType = TM.isPPC64() ?
- Type::getInt64Ty(C) : Type::getInt32Ty(C);
+ HWLoopInfo.CountType =
+ TM.isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
}
@@ -419,9 +422,7 @@ void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
-bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
- return EnablePPCColdCC;
-}
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) { return EnablePPCColdCC; }
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// On the A2, always unroll aggressively.
@@ -439,13 +440,11 @@ PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
-bool PPCTTIImpl::enableInterleavedAccessVectorization() {
- return true;
-}
+bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; }
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
- assert(ClassID == GPRRC || ClassID == FPRRC ||
- ClassID == VRRC || ClassID == VSXRC);
+ assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC ||
+ ClassID == VSXRC);
if (ST->hasVSX()) {
assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
return ClassID == VSXRC ? 64 : 32;
@@ -469,16 +468,20 @@ unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
return GPRRC;
}
-const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
+const char *PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
switch (ClassID) {
- default:
- llvm_unreachable("unknown register class");
- return "PPC::unknown register class";
- case GPRRC: return "PPC::GPRRC";
- case FPRRC: return "PPC::FPRRC";
- case VRRC: return "PPC::VRRC";
- case VSXRC: return "PPC::VSXRC";
+ default:
+ llvm_unreachable("unknown register class");
+ return "PPC::unknown register class";
+ case GPRRC:
+ return "PPC::GPRRC";
+ case FPRRC:
+ return "PPC::FPRRC";
+ case VRRC:
+ return "PPC::VRRC";
+ case VSXRC:
+ return "PPC::VSXRC";
}
}
@@ -509,9 +512,7 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
return 64;
}
-unsigned PPCTTIImpl::getPrefetchDistance() const {
- return 300;
-}
+unsigned PPCTTIImpl::getPrefetchDistance() const { return 300; }
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
unsigned Directive = ST->getCPUDirective();
@@ -582,8 +583,7 @@ InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
InstructionCost PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+ ArrayRef<const Value *> Args, const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
@@ -592,12 +592,12 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info, Args, CxtI);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+ Args, CxtI);
// Fallback to the default implementation.
- InstructionCost Cost = BaseT::getArithmeticInstrCost(
- Opcode, Ty, CostKind, Op1Info, Op2Info);
+ InstructionCost Cost =
+ BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
return Cost * CostFactor;
}
@@ -753,8 +753,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// because they require store and reload with the attendant
// processor stall for load-hit-store. Until VSX is available,
// these need to be estimated as very costly.
- if (ISD == ISD::EXTRACT_VECTOR_ELT ||
- ISD == ISD::INSERT_VECTOR_ELT)
+ if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT)
return LHSPenalty + Cost;
return Cost;
@@ -771,7 +770,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (!CostFactor.isValid())
return InstructionCost::getMax();
- if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
@@ -787,11 +786,11 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
Cost *= CostFactor;
- bool IsAltivecType = ST->hasAltivec() &&
- (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
- LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
- bool IsVSXType = ST->hasVSX() &&
- (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+ bool IsAltivecType =
+ ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+ bool IsVSXType =
+ ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
// VSX has 32b/64b load instructions. Legalization can handle loading of
// 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
@@ -884,7 +883,7 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
// instruction). For each result vector, we need one shuffle per incoming
// vector (except that the first shuffle can take two incoming vectors
// because it does not need to take itself).
- Cost += Factor*(LT.first-1);
+ Cost += Factor * (LT.first - 1);
return Cost;
}
@@ -895,6 +894,20 @@ PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+bool PPCTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // Check that targets features are exactly the same. We can revisit to see if
+ // we can improve this.
+ return CallerBits == CalleeBits;
+}
+
bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
const ArrayRef<Type *> &Types) const {
@@ -950,9 +963,7 @@ bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
-bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
- return false;
-}
+bool PPCTTIImpl::isNumRegsMajorCostOfLSR() { return false; }
bool PPCTTIImpl::shouldBuildRelLookupTables() const {
const PPCTargetMachine &TM = ST->getTargetMachine();
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 3cb60d7a1785ae3..bf3ddad134e14c3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -139,6 +139,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
const ArrayRef<Type *> &Types) const;
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Thanks Henry. This is a good start. Can you add a test now? Ideally it includes an example that has attributes that would be problematic when inlined with the default target hook.
Added test
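(The test file itself isn't reproduced in this thread. As a rough sketch only — the triple, function names, and feature strings below are illustrative assumptions, not the PR's actual test — a check along these lines exercises the new hook: under the default subset rule the callee's +altivec is a subset of the caller's +altivec,+vsx, so inlining would have been allowed, while the strict-equality override keeps the call site intact.)

; RUN: opt -mtriple=powerpc64le-unknown-linux-gnu -passes=inline -S < %s | FileCheck %s

; The call must survive inlining: the feature sets differ, so the strict
; equality check in PPCTTIImpl::areInlineCompatible returns false.
; CHECK-LABEL: define void @caller()
; CHECK: call void @callee()

define void @callee() #0 {
entry:
  ret void
}

define void @caller() #1 {
entry:
  call void @callee()
  ret void
}

attributes #0 = { "target-features"="+altivec" }
attributes #1 = { "target-features"="+altivec,+vsx" }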
I think in general there is a subset of target features where having a subset should allow inlining: e.g. vector instructions, excluding floating-point FMA, MMA, and others that can have noticeable side effects in the program. But there definitely exist cases where we need to compare for strict equality: e.g. we should not inline a function that does not allow unaligned FP access into a caller that does. This is just one example, but there are probably a lot more; a sketch of that case follows.
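(A sketch of that second case, with hypothetical functions; the allow-unaligned-fp-access feature string is assumed from PPC.td. The callee's empty feature set is trivially a subset of the caller's, so the default subset hook would permit this inlining, while strict equality rejects it.)

; The callee was compiled assuming unaligned FP accesses are not allowed;
; the caller permits them. An empty callee feature set is a subset of any
; caller feature set, so the subset-based default would inline here.
define double @callee_no_unaligned(ptr %p) #0 {
entry:
  %v = load double, ptr %p, align 4
  ret double %v
}

define double @caller_unaligned_ok(ptr %p) #1 {
entry:
  %r = call double @callee_no_unaligned(ptr %p)
  ret double %r
}

attributes #0 = { "target-features"="" }
attributes #1 = { "target-features"="+allow-unaligned-fp-access" }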
LGTM, this should be conservatively correct. We can relax the constraint further in follow-up work.
LGTM; thanks!