[PPC] Implement areInlineCompatible
#126562
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-powerpc
Author: Henry Jiang (mustartt)
Changes: After the default implementation swap in #117493, areInlineCompatible checks whether the callee's features are a subset of the caller's features. This is not a safe assumption in general on PPC, so we fall back to strict feature-set equality for now and will see what improvements we can make.
Full diff: https://github.com/llvm/llvm-project/pull/126562.diff
2 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index c308ec332e84434..885e4b3fb323093 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -25,26 +25,29 @@ using namespace llvm;
#define DEBUG_TYPE "ppctti"
static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
-cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
+ cl::desc("add masking cost for i1 vectors"),
+ cl::init(true), cl::Hidden);
-static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
-cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ DisablePPCConstHoist("disable-ppc-constant-hoisting",
+ cl::desc("disable constant hoisting on PPC"),
+ cl::init(false), cl::Hidden);
static cl::opt<bool>
-EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
- cl::desc("Enable using coldcc calling conv for cold "
- "internal functions"));
+ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+ cl::desc("Enable using coldcc calling conv for cold "
+ "internal functions"));
static cl::opt<bool>
-LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
- cl::desc("Do not add instruction count to lsr cost model"));
+ LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Do not add instruction count to lsr cost model"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
- cl::desc("Loops with a constant trip count smaller than "
- "this value will not use the count register."));
+static cl::opt<unsigned> SmallCTRLoopThreshold(
+ "min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
//===----------------------------------------------------------------------===//
//
@@ -56,8 +59,9 @@ TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
- return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
- TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
+ return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow
+ ? TTI::PSK_SlowHardware
+ : TTI::PSK_FastHardware;
return TTI::PSK_Software;
}
@@ -290,14 +294,12 @@ InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return TTI::TCC_Free;
if (RunFree) {
- if (Imm.getBitWidth() <= 32 &&
- (isShiftedMask_32(Imm.getZExtValue()) ||
- isShiftedMask_32(~Imm.getZExtValue())))
+ if (Imm.getBitWidth() <= 32 && (isShiftedMask_32(Imm.getZExtValue()) ||
+ isShiftedMask_32(~Imm.getZExtValue())))
return TTI::TCC_Free;
- if (ST->isPPC64() &&
- (isShiftedMask_64(Imm.getZExtValue()) ||
- isShiftedMask_64(~Imm.getZExtValue())))
+ if (ST->isPPC64() && (isShiftedMask_64(Imm.getZExtValue()) ||
+ isShiftedMask_64(~Imm.getZExtValue())))
return TTI::TCC_Free;
}
@@ -364,14 +366,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
Call->getIntrinsicID() == Intrinsic::loop_decrement)
return false;
- SmallVector<BasicBlock*, 4> ExitingBlocks;
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
// If there is an exit edge known to be frequently taken,
// we should not transform this loop.
for (auto &BB : ExitingBlocks) {
Instruction *TI = BB->getTerminator();
- if (!TI) continue;
+ if (!TI)
+ continue;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
uint64_t TrueWeight = 0, FalseWeight = 0;
@@ -382,15 +385,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// If the exit path is more frequent than the loop path,
// we return here without further analysis for this loop.
bool TrueIsExit = !L->contains(BI->getSuccessor(0));
- if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ if ((TrueIsExit && FalseWeight < TrueWeight) ||
(!TrueIsExit && FalseWeight > TrueWeight))
return false;
}
}
LLVMContext &C = L->getHeader()->getContext();
- HWLoopInfo.CountType = TM.isPPC64() ?
- Type::getInt64Ty(C) : Type::getInt32Ty(C);
+ HWLoopInfo.CountType =
+ TM.isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
}
@@ -419,9 +422,7 @@ void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
-bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
- return EnablePPCColdCC;
-}
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) { return EnablePPCColdCC; }
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// On the A2, always unroll aggressively.
@@ -439,13 +440,11 @@ PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
-bool PPCTTIImpl::enableInterleavedAccessVectorization() {
- return true;
-}
+bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; }
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
- assert(ClassID == GPRRC || ClassID == FPRRC ||
- ClassID == VRRC || ClassID == VSXRC);
+ assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC ||
+ ClassID == VSXRC);
if (ST->hasVSX()) {
assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
return ClassID == VSXRC ? 64 : 32;
@@ -469,16 +468,20 @@ unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
return GPRRC;
}
-const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
+const char *PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
switch (ClassID) {
- default:
- llvm_unreachable("unknown register class");
- return "PPC::unknown register class";
- case GPRRC: return "PPC::GPRRC";
- case FPRRC: return "PPC::FPRRC";
- case VRRC: return "PPC::VRRC";
- case VSXRC: return "PPC::VSXRC";
+ default:
+ llvm_unreachable("unknown register class");
+ return "PPC::unknown register class";
+ case GPRRC:
+ return "PPC::GPRRC";
+ case FPRRC:
+ return "PPC::FPRRC";
+ case VRRC:
+ return "PPC::VRRC";
+ case VSXRC:
+ return "PPC::VSXRC";
}
}
@@ -509,9 +512,7 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
return 64;
}
-unsigned PPCTTIImpl::getPrefetchDistance() const {
- return 300;
-}
+unsigned PPCTTIImpl::getPrefetchDistance() const { return 300; }
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
unsigned Directive = ST->getCPUDirective();
@@ -582,8 +583,7 @@ InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
InstructionCost PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+ ArrayRef<const Value *> Args, const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
@@ -592,12 +592,12 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info, Args, CxtI);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+ Args, CxtI);
// Fallback to the default implementation.
- InstructionCost Cost = BaseT::getArithmeticInstrCost(
- Opcode, Ty, CostKind, Op1Info, Op2Info);
+ InstructionCost Cost =
+ BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
return Cost * CostFactor;
}
@@ -753,8 +753,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// because they require store and reload with the attendant
// processor stall for load-hit-store. Until VSX is available,
// these need to be estimated as very costly.
- if (ISD == ISD::EXTRACT_VECTOR_ELT ||
- ISD == ISD::INSERT_VECTOR_ELT)
+ if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT)
return LHSPenalty + Cost;
return Cost;
@@ -771,7 +770,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (!CostFactor.isValid())
return InstructionCost::getMax();
- if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
@@ -787,11 +786,11 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
Cost *= CostFactor;
- bool IsAltivecType = ST->hasAltivec() &&
- (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
- LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
- bool IsVSXType = ST->hasVSX() &&
- (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+ bool IsAltivecType =
+ ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+ bool IsVSXType =
+ ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
// VSX has 32b/64b load instructions. Legalization can handle loading of
// 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
@@ -884,7 +883,7 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
// instruction). For each result vector, we need one shuffle per incoming
// vector (except that the first shuffle can take two incoming vectors
// because it does not need to take itself).
- Cost += Factor*(LT.first-1);
+ Cost += Factor * (LT.first - 1);
return Cost;
}
@@ -895,6 +894,20 @@ PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+bool PPCTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // Check that targets features are exactly the same. We can revisit to see if
+ // we can improve this.
+ return CallerBits == CalleeBits;
+}
+
bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
const ArrayRef<Type *> &Types) const {
@@ -950,9 +963,7 @@ bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
-bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
- return false;
-}
+bool PPCTTIImpl::isNumRegsMajorCostOfLSR() { return false; }
bool PPCTTIImpl::shouldBuildRelLookupTables() const {
const PPCTargetMachine &TM = ST->getTargetMachine();
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 3cb60d7a1785ae3..bf3ddad134e14c3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -139,6 +139,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
const ArrayRef<Type *> &Types) const;
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Thanks Henry. This is a good start. Can you add a test now? Ideally it includes an example that has attributes that would be problematic when inlined with the default target hook.
Added test
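(The test file itself isn't reproduced in this thread. As a rough sketch only — the triple, function names, and feature strings below are illustrative assumptions, not the PR's actual test — a check along these lines exercises the new hook: under the default subset rule the callee's +altivec is a subset of the caller's +altivec,+vsx, so inlining would have been allowed, while the strict-equality override keeps the call site intact.)

; RUN: opt -mtriple=powerpc64le-unknown-linux-gnu -passes=inline -S < %s | FileCheck %s

; The call must survive inlining: the feature sets differ, so the strict
; equality check in PPCTTIImpl::areInlineCompatible returns false.
; CHECK-LABEL: define void @caller()
; CHECK: call void @callee()

define void @callee() #0 {
entry:
  ret void
}

define void @caller() #1 {
entry:
  call void @callee()
  ret void
}

attributes #0 = { "target-features"="+altivec" }
attributes #1 = { "target-features"="+altivec,+vsx" }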
I think in general there is a subset of target features where having a subset should allow inlining: e.g. vector instructions, excluding floating-point FMA, MMA, and others that can have noticeable side effects in the program. But there definitely exist cases where we need to compare for strict equality: e.g. we should not inline a function that does not allow unaligned FP access into a caller that does. This is just one example, but there are probably a lot more; a sketch of that case follows.
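(A sketch of that second case, with hypothetical functions; the allow-unaligned-fp-access feature string is assumed from PPC.td. The callee's empty feature set is trivially a subset of the caller's, so the default subset hook would permit this inlining, while strict equality rejects it.)

; The callee was compiled assuming unaligned FP accesses are not allowed;
; the caller permits them. An empty callee feature set is a subset of any
; caller feature set, so the subset-based default would inline here.
define double @callee_no_unaligned(ptr %p) #0 {
entry:
  %v = load double, ptr %p, align 4
  ret double %v
}

define double @caller_unaligned_ok(ptr %p) #1 {
entry:
  %r = call double @callee_no_unaligned(ptr %p)
  ret double %r
}

attributes #0 = { "target-features"="" }
attributes #1 = { "target-features"="+allow-unaligned-fp-access" }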
LGTM, this should be conservatively correct. We can relax the constraint further in follow-up work.
LGTM; thanks!