[FuncSpec] Only compute Latency bonus when necessary #113159

hazzlim · 2024-10-21T12:39:48Z

Only compute the Latency component of a specialisation's Bonus when necessary, to avoid unnecessarily computing the Block Frequency Information for a Function.

llvmbot · 2024-10-21T12:40:23Z

@llvm/pr-subscribers-llvm-transforms

Author: Hari Limaye (hazzlim)

Changes

Only compute the Latency component of a specialisation's Bonus when necessary, to avoid unnecessarily computing the Block Frequency Information for a Function.

Patch is 21.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113159.diff

4 Files Affected:

(modified) llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h (+13-10)
(modified) llvm/lib/Transforms/IPO/FunctionSpecialization.cpp (+51-34)
(modified) llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll (+9-9)
(modified) llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp (+109-68)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index b001771951e0fe..a50eafe48293ad 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -173,8 +173,9 @@ struct Bonus {
 };
 
 class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
+  std::function<BlockFrequencyInfo &(Function &)> GetBFI;
+  Function *F;
   const DataLayout &DL;
-  BlockFrequencyInfo &BFI;
   TargetTransformInfo &TTI;
   SCCPSolver &Solver;
 
@@ -192,17 +193,20 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   ConstMap::iterator LastVisited;
 
 public:
-  InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI,
-                  TargetTransformInfo &TTI, SCCPSolver &Solver)
-      : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {}
+  InstCostVisitor(std::function<BlockFrequencyInfo &(Function &)> GetBFI,
+                  Function *F, const DataLayout &DL, TargetTransformInfo &TTI,
+                  SCCPSolver &Solver)
+      : GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {}
 
   bool isBlockExecutable(BasicBlock *BB) {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  Bonus getSpecializationBonus(Argument *A, Constant *C);
+  Cost getCodeSizeBonus(Argument *A, Constant *C);
+
+  Cost getCodeSizeBonusFromPendingPHIs();
 
-  Bonus getBonusFromPendingPHIs();
+  Cost getLatencyBonus();
 
 private:
   friend class InstVisitor<InstCostVisitor, Constant *>;
@@ -210,8 +214,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                     DenseSet<BasicBlock *> &DeadBlocks);
 
-  Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
-                     Constant *C = nullptr);
+  Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr,
+                            Constant *C = nullptr);
 
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
@@ -283,9 +287,8 @@ class FunctionSpecializer {
   bool run();
 
   InstCostVisitor getInstCostVisitorFor(Function *F) {
-    auto &BFI = GetBFI(*F);
     auto &TTI = GetTTI(*F);
-    return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
+    return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver);
   }
 
 private:
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 7feebbe420ae53..38c10c29cf4e3a 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
 Cost InstCostVisitor::estimateBasicBlocks(
                           SmallVectorImpl<BasicBlock *> &WorkList) {
   Cost CodeSize = 0;
-  // Accumulate the instruction cost of each basic block weighted by frequency.
+  // Accumulate the codesize savings of each basic block.
   while (!WorkList.empty()) {
     BasicBlock *BB = WorkList.pop_back_val();
 
@@ -154,37 +154,55 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
   return KnownConstants.lookup(V);
 }
 
-Bonus InstCostVisitor::getBonusFromPendingPHIs() {
-  Bonus B;
+Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() {
+  Cost CodeSize;
   while (!PendingPHIs.empty()) {
     Instruction *Phi = PendingPHIs.pop_back_val();
     // The pending PHIs could have been proven dead by now.
     if (isBlockExecutable(Phi->getParent()))
-      B += getUserBonus(Phi);
+      CodeSize += getUserCodeSizeBonus(Phi);
   }
-  return B;
+  return CodeSize;
 }
 
-/// Compute a bonus for replacing argument \p A with constant \p C.
-Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
+/// Compute the codesize savings for replacing argument \p A with constant \p C.
+Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) {
   LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                     << C->getNameOrAsOperand() << "\n");
-  Bonus B;
+  Cost CodeSize;
   for (auto *U : A->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, A, C);
+        CodeSize += getUserCodeSizeBonus(UI, A, C);
 
   LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
-                    << B.CodeSize << ", Latency = " << B.Latency
-                    << "} for argument " << *A << "\n");
-  return B;
+                    << CodeSize << "} for argument " << *A << "\n");
+  return CodeSize;
+}
+
+Cost InstCostVisitor::getLatencyBonus() {
+  auto &BFI = GetBFI(*F);
+  Cost Latency = 0;
+
+  for (auto Pair : KnownConstants) {
+    Instruction *I = dyn_cast<Instruction>(Pair.first);
+    if (!I)
+      continue;
+
+    uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
+                      BFI.getEntryFreq().getFrequency();
+    Latency +=
+        Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  }
+
+  return Latency;
 }
 
-Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
+Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use,
+                                           Constant *C) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
-    return {0, 0};
+    return 0;
 
   // Cache the iterator before visiting.
   LastVisited = Use ? KnownConstants.insert({Use, C}).first
@@ -198,7 +216,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
   } else {
     C = visit(*User);
     if (!C)
-      return {0, 0};
+      return 0;
   }
 
   // Even though it doesn't make sense to bind switch and branch instructions
@@ -208,23 +226,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
 
   CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
 
-  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
-                    BFI.getEntryFreq().getFrequency();
-
-  Cost Latency = Weight *
-      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
-
   LLVM_DEBUG(dbgs() << "FnSpecialization:     {CodeSize = " << CodeSize
-                    << ", Latency = " << Latency << "} for user "
-                    << *User << "\n");
+                    << "} for user " << *User << "\n");
 
-  Bonus B(CodeSize, Latency);
   for (auto *U : User->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (UI != User && isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, User, C);
+        CodeSize += getUserCodeSizeBonus(UI, User, C);
 
-  return B;
+  return CodeSize;
 }
 
 Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
@@ -875,24 +885,23 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       AllSpecs[Index].CallSites.push_back(&CS);
     } else {
       // Calculate the specialisation gain.
-      Bonus B;
+      Cost CodeSize;
       unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
       for (ArgInfo &A : S.Args) {
-        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
+        CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual);
         Score += getInliningBonus(A.Formal, A.Actual);
       }
-      B += Visitor.getBonusFromPendingPHIs();
-
+      CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs();
 
       LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
-                        << B.CodeSize << ", Latency = " << B.Latency
-                        << ", Inlining = " << Score << "}\n");
+                        << CodeSize << ", Inlining = " << Score << "}\n");
 
+      Bonus B = {CodeSize, 0};
       FunctionGrowth[F] += FuncSize - B.CodeSize;
 
       auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
-                             unsigned FuncGrowth) -> bool {
+                             unsigned FuncGrowth, InstCostVisitor &V) -> bool {
         // No check required.
         if (ForceSpecialization)
           return true;
@@ -902,6 +911,14 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
         // Minimum codesize savings.
         if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
           return false;
+
+        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
+        B += {0, V.getLatencyBonus()};
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
+                   << B.Latency << "}\n");
+
         // Minimum latency savings.
         if (B.Latency < MinLatencySavings * FuncSize / 100)
           return false;
@@ -912,7 +929,7 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       };
 
       // Discard unprofitable specialisations.
-      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
+      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor))
         continue;
 
       // Create a new specialisation entry.
diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
index ff57569d127884..f8c8e33dfc2335 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
@@ -4,25 +4,25 @@
 
 ; This test case is trying to validate that the postdomtree is preserved
 ; correctly by the ipsccp pass. A tricky bug was introduced in commit
-; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order
+; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order
 ; to setup a DomTreeUpdater (to update the PDT during transformation in order
 ; to preserve the analysis). But given that commit the PDT could end up being
 ; required and calculated via BlockFrequency analysis. So the problem was that
 ; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't
-; cached at the begininng of IPSCCP, to indicate that no updates where needed
+; cached at the beginning of IPSCCP, to indicate that no updates were needed
 ; for PDT. But then the PDT was calculated, given the input IR, and preserved
 ; using the non-updated state (as the DTU wasn't configured for updating the
 ; PDT).
 
 ; CHECK-NOT: <badref>
 ; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
-; CHECK-NEXT:   [1]  <<exit node>> {4294967295,4294967295} [0]
-; CHECK-NEXT:     [2] %for.cond34 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %for.cond16 {4294967295,4294967295} [2]
-; CHECK-NEXT:     [2] %for.body {4294967295,4294967295} [1]
-; CHECK-NEXT:     [2] %if.end4 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %entry {4294967295,4294967295} [2]
-; CHECK-NEXT: Roots: %for.cond34 %for.body
+; CHECK-NEXT:  [1]  <<exit node>> {4294967295,4294967295} [0]
+; CHECK-NEXT:    [2] %for.body {4294967295,4294967295} [1]
+; CHECK-NEXT:    [2] %if.end4 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %entry {4294967295,4294967295} [2]
+; CHECK-NEXT:    [2] %for.cond34 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %for.cond16 {4294967295,4294967295} [2]
+; CHECK-NEXT: Roots: %for.body %for.cond34
 ; CHECK-NEXT: PostDominatorTree for function: bar
 ; CHECK-NOT: <badref>
 
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index b0ff55489e1762..9f2053948cb33b 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -43,6 +43,7 @@ class FunctionSpecializationTest : public testing::Test {
   FunctionAnalysisManager FAM;
   std::unique_ptr<Module> M;
   std::unique_ptr<SCCPSolver> Solver;
+  SmallVector<Instruction *, 8> Instructions;
 
   FunctionSpecializationTest() {
     FAM.registerPass([&] { return TargetLibraryAnalysis(); });
@@ -97,21 +98,29 @@ class FunctionSpecializationTest : public testing::Test {
                                GetAC);
   }
 
-  Bonus getInstCost(Instruction &I, bool SizeOnly = false) {
+  Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) {
     auto &TTI = FAM.getResult<TargetIRAnalysis>(*I.getFunction());
-    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*I.getFunction());
 
     Cost CodeSize =
         TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
 
-    Cost Latency =
-        SizeOnly
-            ? 0
-            : BFI.getBlockFreq(I.getParent()).getFrequency() /
-                  BFI.getEntryFreq().getFrequency() *
-                  TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
+    if (!SizeOnly)
+      Instructions.push_back(&I);
 
-    return {CodeSize, Latency};
+    return CodeSize;
+  }
+
+  Cost getLatency(Function *F) {
+    auto &TTI = FAM.getResult<TargetIRAnalysis>(*F);
+    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*F);
+
+    Cost Latency = 0;
+    for (const Instruction *I : Instructions)
+      Latency += BFI.getBlockFreq(I->getParent()).getFrequency() /
+                 BFI.getEntryFreq().getFrequency() *
+                 TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+
+    return Latency;
   }
 };
 
@@ -171,25 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
   Instruction &BrLoop = BB2.back();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // and + or + add
-  Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // switch + sdiv + br + br
-  Ref = getInstCost(Switch) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
+  Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, BranchInst) {
@@ -238,27 +252,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
   Instruction &BrLoop = BB2.front();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // add
-  Ref = getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // branch + sub + br + sdiv + br
-  Ref = getInstCost(Branch) +
-        getInstCost(Sub, /*SizeOnly =*/ true) +
-        getInstCost(BrBB1BB2) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), False);
+  Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), False);
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, SelectInst) {
@@ -279,14 +297,22 @@ TEST_F(FunctionSpecializationTest, SelectInst) {
   Constant *False = ConstantInt::getFalse(M.getContext());
   Instruction &Select = *F->front().begin();
 
-  Bonus Ref = getInstCost(Select);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), False);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), Zero);
-  EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  Cost RefCodeSize = getInstCodeSize(Select);
+  Cost RefLatency = getLatency(F);
+
+  Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False);
+  EXPECT_TRUE(TestCodeSize == 0);
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(TestCodeSize == 0);
+  Cost TestLatency = Visitor.getLatencyBonus();
+  EXPECT_TRUE(TestLatency == 0);
+
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero);
+  EXPECT_EQ(TestCodeSize, RefCodeSize);
+  EXPECT_TRUE(TestCodeSize > 0);
+  TestLatency = Visitor.getLatencyBonus();
+  EXPECT_EQ(TestLatency, RefLatency);
+  EXPECT_TRUE(TestLatency > 0);
 }
 
 TEST_F(FunctionSpecializationTest, Misc) {
@@ -332,26 +358,32 @@ TEST_F(FunctionSpecializationTest, Misc) {
   Instruction &Smax = *BlockIter++;
 
   // icmp + zext
-  Bonus Ref = getInstCost(Icmp) + getInstCost(Zext);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // select
-  Ref = getInstCost(Select);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), True);
+  Ref = getInstCodeSize(Select);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), True);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // gep + load + freeze + smax
-  Ref = getInstCost(Gep) + getInstCost(Load) + getInstCost(Freeze) +
-        getInstCost(Smax);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), GV);
+  Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) +
+        getInstCodeSize(Smax);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), GV);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(3), Undef);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef);
+  EXPECT_TRUE(Test == 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, PhiNode) {
@@ -401,25 +433,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
   Instruction &Icmp = *++BB.begin();
   Instruction &Branch = BB.back();
 
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  EXPECT_TRUE(Test == 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(Test == 0);
+
+  Test = Visitor.getLatencyBonus();
+  EXPECT_TRUE(Test == 0);
 
   // switch + phi + br
-  Bonus Ref = getInstCost(Switch...
[truncated]

llvmbot · 2024-10-21T12:40:23Z

@llvm/pr-subscribers-function-specialization

Author: Hari Limaye (hazzlim)

Changes

Only compute the Latency component of a specialisation's Bonus when necessary, to avoid unnecessarily computing the Block Frequency Information for a Function.

Patch is 21.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113159.diff

4 Files Affected:

(modified) llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h (+13-10)
(modified) llvm/lib/Transforms/IPO/FunctionSpecialization.cpp (+51-34)
(modified) llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll (+9-9)
(modified) llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp (+109-68)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index b001771951e0fe..a50eafe48293ad 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -173,8 +173,9 @@ struct Bonus {
 };
 
 class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
+  std::function<BlockFrequencyInfo &(Function &)> GetBFI;
+  Function *F;
   const DataLayout &DL;
-  BlockFrequencyInfo &BFI;
   TargetTransformInfo &TTI;
   SCCPSolver &Solver;
 
@@ -192,17 +193,20 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   ConstMap::iterator LastVisited;
 
 public:
-  InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI,
-                  TargetTransformInfo &TTI, SCCPSolver &Solver)
-      : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {}
+  InstCostVisitor(std::function<BlockFrequencyInfo &(Function &)> GetBFI,
+                  Function *F, const DataLayout &DL, TargetTransformInfo &TTI,
+                  SCCPSolver &Solver)
+      : GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {}
 
   bool isBlockExecutable(BasicBlock *BB) {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  Bonus getSpecializationBonus(Argument *A, Constant *C);
+  Cost getCodeSizeBonus(Argument *A, Constant *C);
+
+  Cost getCodeSizeBonusFromPendingPHIs();
 
-  Bonus getBonusFromPendingPHIs();
+  Cost getLatencyBonus();
 
 private:
   friend class InstVisitor<InstCostVisitor, Constant *>;
@@ -210,8 +214,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                     DenseSet<BasicBlock *> &DeadBlocks);
 
-  Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
-                     Constant *C = nullptr);
+  Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr,
+                            Constant *C = nullptr);
 
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
@@ -283,9 +287,8 @@ class FunctionSpecializer {
   bool run();
 
   InstCostVisitor getInstCostVisitorFor(Function *F) {
-    auto &BFI = GetBFI(*F);
     auto &TTI = GetTTI(*F);
-    return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
+    return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver);
   }
 
 private:
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 7feebbe420ae53..38c10c29cf4e3a 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
 Cost InstCostVisitor::estimateBasicBlocks(
                           SmallVectorImpl<BasicBlock *> &WorkList) {
   Cost CodeSize = 0;
-  // Accumulate the instruction cost of each basic block weighted by frequency.
+  // Accumulate the codesize savings of each basic block.
   while (!WorkList.empty()) {
     BasicBlock *BB = WorkList.pop_back_val();
 
@@ -154,37 +154,55 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
   return KnownConstants.lookup(V);
 }
 
-Bonus InstCostVisitor::getBonusFromPendingPHIs() {
-  Bonus B;
+Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() {
+  Cost CodeSize;
   while (!PendingPHIs.empty()) {
     Instruction *Phi = PendingPHIs.pop_back_val();
     // The pending PHIs could have been proven dead by now.
     if (isBlockExecutable(Phi->getParent()))
-      B += getUserBonus(Phi);
+      CodeSize += getUserCodeSizeBonus(Phi);
   }
-  return B;
+  return CodeSize;
 }
 
-/// Compute a bonus for replacing argument \p A with constant \p C.
-Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
+/// Compute the codesize savings for replacing argument \p A with constant \p C.
+Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) {
   LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                     << C->getNameOrAsOperand() << "\n");
-  Bonus B;
+  Cost CodeSize;
   for (auto *U : A->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, A, C);
+        CodeSize += getUserCodeSizeBonus(UI, A, C);
 
   LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
-                    << B.CodeSize << ", Latency = " << B.Latency
-                    << "} for argument " << *A << "\n");
-  return B;
+                    << CodeSize << "} for argument " << *A << "\n");
+  return CodeSize;
+}
+
+Cost InstCostVisitor::getLatencyBonus() {
+  auto &BFI = GetBFI(*F);
+  Cost Latency = 0;
+
+  for (auto Pair : KnownConstants) {
+    Instruction *I = dyn_cast<Instruction>(Pair.first);
+    if (!I)
+      continue;
+
+    uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
+                      BFI.getEntryFreq().getFrequency();
+    Latency +=
+        Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  }
+
+  return Latency;
 }
 
-Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
+Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use,
+                                           Constant *C) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
-    return {0, 0};
+    return 0;
 
   // Cache the iterator before visiting.
   LastVisited = Use ? KnownConstants.insert({Use, C}).first
@@ -198,7 +216,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
   } else {
     C = visit(*User);
     if (!C)
-      return {0, 0};
+      return 0;
   }
 
   // Even though it doesn't make sense to bind switch and branch instructions
@@ -208,23 +226,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
 
   CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
 
-  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
-                    BFI.getEntryFreq().getFrequency();
-
-  Cost Latency = Weight *
-      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
-
   LLVM_DEBUG(dbgs() << "FnSpecialization:     {CodeSize = " << CodeSize
-                    << ", Latency = " << Latency << "} for user "
-                    << *User << "\n");
+                    << "} for user " << *User << "\n");
 
-  Bonus B(CodeSize, Latency);
   for (auto *U : User->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (UI != User && isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, User, C);
+        CodeSize += getUserCodeSizeBonus(UI, User, C);
 
-  return B;
+  return CodeSize;
 }
 
 Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
@@ -875,24 +885,23 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       AllSpecs[Index].CallSites.push_back(&CS);
     } else {
       // Calculate the specialisation gain.
-      Bonus B;
+      Cost CodeSize;
       unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
       for (ArgInfo &A : S.Args) {
-        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
+        CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual);
         Score += getInliningBonus(A.Formal, A.Actual);
       }
-      B += Visitor.getBonusFromPendingPHIs();
-
+      CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs();
 
       LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
-                        << B.CodeSize << ", Latency = " << B.Latency
-                        << ", Inlining = " << Score << "}\n");
+                        << CodeSize << ", Inlining = " << Score << "}\n");
 
+      Bonus B = {CodeSize, 0};
       FunctionGrowth[F] += FuncSize - B.CodeSize;
 
       auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
-                             unsigned FuncGrowth) -> bool {
+                             unsigned FuncGrowth, InstCostVisitor &V) -> bool {
         // No check required.
         if (ForceSpecialization)
           return true;
@@ -902,6 +911,14 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
         // Minimum codesize savings.
         if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
           return false;
+
+        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
+        B += {0, V.getLatencyBonus()};
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
+                   << B.Latency << "}\n");
+
         // Minimum latency savings.
         if (B.Latency < MinLatencySavings * FuncSize / 100)
           return false;
@@ -912,7 +929,7 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       };
 
       // Discard unprofitable specialisations.
-      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
+      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor))
         continue;
 
       // Create a new specialisation entry.
diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
index ff57569d127884..f8c8e33dfc2335 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
@@ -4,25 +4,25 @@
 
 ; This test case is trying to validate that the postdomtree is preserved
 ; correctly by the ipsccp pass. A tricky bug was introduced in commit
-; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order
+; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order
 ; to setup a DomTreeUpdater (to update the PDT during transformation in order
 ; to preserve the analysis). But given that commit the PDT could end up being
 ; required and calculated via BlockFrequency analysis. So the problem was that
 ; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't
-; cached at the begininng of IPSCCP, to indicate that no updates where needed
+; cached at the beginning of IPSCCP, to indicate that no updates were needed
 ; for PDT. But then the PDT was calculated, given the input IR, and preserved
 ; using the non-updated state (as the DTU wasn't configured for updating the
 ; PDT).
 
 ; CHECK-NOT: <badref>
 ; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
-; CHECK-NEXT:   [1]  <<exit node>> {4294967295,4294967295} [0]
-; CHECK-NEXT:     [2] %for.cond34 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %for.cond16 {4294967295,4294967295} [2]
-; CHECK-NEXT:     [2] %for.body {4294967295,4294967295} [1]
-; CHECK-NEXT:     [2] %if.end4 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %entry {4294967295,4294967295} [2]
-; CHECK-NEXT: Roots: %for.cond34 %for.body
+; CHECK-NEXT:  [1]  <<exit node>> {4294967295,4294967295} [0]
+; CHECK-NEXT:    [2] %for.body {4294967295,4294967295} [1]
+; CHECK-NEXT:    [2] %if.end4 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %entry {4294967295,4294967295} [2]
+; CHECK-NEXT:    [2] %for.cond34 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %for.cond16 {4294967295,4294967295} [2]
+; CHECK-NEXT: Roots: %for.body %for.cond34
 ; CHECK-NEXT: PostDominatorTree for function: bar
 ; CHECK-NOT: <badref>
 
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index b0ff55489e1762..9f2053948cb33b 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -43,6 +43,7 @@ class FunctionSpecializationTest : public testing::Test {
   FunctionAnalysisManager FAM;
   std::unique_ptr<Module> M;
   std::unique_ptr<SCCPSolver> Solver;
+  SmallVector<Instruction *, 8> Instructions;
 
   FunctionSpecializationTest() {
     FAM.registerPass([&] { return TargetLibraryAnalysis(); });
@@ -97,21 +98,29 @@ class FunctionSpecializationTest : public testing::Test {
                                GetAC);
   }
 
-  Bonus getInstCost(Instruction &I, bool SizeOnly = false) {
+  Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) {
     auto &TTI = FAM.getResult<TargetIRAnalysis>(*I.getFunction());
-    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*I.getFunction());
 
     Cost CodeSize =
         TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
 
-    Cost Latency =
-        SizeOnly
-            ? 0
-            : BFI.getBlockFreq(I.getParent()).getFrequency() /
-                  BFI.getEntryFreq().getFrequency() *
-                  TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
+    if (!SizeOnly)
+      Instructions.push_back(&I);
 
-    return {CodeSize, Latency};
+    return CodeSize;
+  }
+
+  Cost getLatency(Function *F) {
+    auto &TTI = FAM.getResult<TargetIRAnalysis>(*F);
+    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*F);
+
+    Cost Latency = 0;
+    for (const Instruction *I : Instructions)
+      Latency += BFI.getBlockFreq(I->getParent()).getFrequency() /
+                 BFI.getEntryFreq().getFrequency() *
+                 TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+
+    return Latency;
   }
 };
 
@@ -171,25 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
   Instruction &BrLoop = BB2.back();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // and + or + add
-  Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // switch + sdiv + br + br
-  Ref = getInstCost(Switch) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
+  Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, BranchInst) {
@@ -238,27 +252,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
   Instruction &BrLoop = BB2.front();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // add
-  Ref = getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // branch + sub + br + sdiv + br
-  Ref = getInstCost(Branch) +
-        getInstCost(Sub, /*SizeOnly =*/ true) +
-        getInstCost(BrBB1BB2) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), False);
+  Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), False);
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, SelectInst) {
@@ -279,14 +297,22 @@ TEST_F(FunctionSpecializationTest, SelectInst) {
   Constant *False = ConstantInt::getFalse(M.getContext());
   Instruction &Select = *F->front().begin();
 
-  Bonus Ref = getInstCost(Select);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), False);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), Zero);
-  EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  Cost RefCodeSize = getInstCodeSize(Select);
+  Cost RefLatency = getLatency(F);
+
+  Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False);
+  EXPECT_TRUE(TestCodeSize == 0);
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(TestCodeSize == 0);
+  Cost TestLatency = Visitor.getLatencyBonus();
+  EXPECT_TRUE(TestLatency == 0);
+
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero);
+  EXPECT_EQ(TestCodeSize, RefCodeSize);
+  EXPECT_TRUE(TestCodeSize > 0);
+  TestLatency = Visitor.getLatencyBonus();
+  EXPECT_EQ(TestLatency, RefLatency);
+  EXPECT_TRUE(TestLatency > 0);
 }
 
 TEST_F(FunctionSpecializationTest, Misc) {
@@ -332,26 +358,32 @@ TEST_F(FunctionSpecializationTest, Misc) {
   Instruction &Smax = *BlockIter++;
 
   // icmp + zext
-  Bonus Ref = getInstCost(Icmp) + getInstCost(Zext);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // select
-  Ref = getInstCost(Select);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), True);
+  Ref = getInstCodeSize(Select);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), True);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // gep + load + freeze + smax
-  Ref = getInstCost(Gep) + getInstCost(Load) + getInstCost(Freeze) +
-        getInstCost(Smax);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), GV);
+  Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) +
+        getInstCodeSize(Smax);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), GV);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(3), Undef);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef);
+  EXPECT_TRUE(Test == 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, PhiNode) {
@@ -401,25 +433,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
   Instruction &Icmp = *++BB.begin();
   Instruction &Branch = BB.back();
 
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  EXPECT_TRUE(Test == 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(Test == 0);
+
+  Test = Visitor.getLatencyBonus();
+  EXPECT_TRUE(Test == 0);
 
   // switch + phi + br
-  Bonus Ref = getInstCost(Switch...
[truncated]

hazzlim · 2024-10-21T12:41:53Z

Change in compile-times for CTMark (measured as instruction-count) when SpecializeLiteralConstant is enabled:

O3 (Non-LTO):

Benchmark        |  Change  |
-----------------+----------+
ClamAV           |  -0.03%  |
7zip             |  -0.01%  |
kimwitu++        |  -0.01%  |
tramp3d-v4       |   0.00%  |
sqlite3          |  -0.03%  |
mafft            |  -0.01%  |
SPASS            |  -0.01%  |
lencod           |  -0.01%  |
consumer-typeset |  -0.14%  |
Bullet           |  -0.01%  |
-----------------+----------+
GEOMEAN          |  -0.03%  |

ReleaseLTO-g:

Benchmark        |  Change  |
-----------------+----------+
ClamAV           |  -0.03%  |
7zip             |  +0.01%  |
kimwitu++        |  -0.02%  |
tramp3d-v4       |  +0.01%  |
sqlite3          |  -0.06%  |
mafft            |  +0.03%  |
SPASS            |  -0.01%  |
lencod           |   0.00%  |
consumer-typeset |  -0.08%  |
Bullet           |  +0.01%  |
-----------------+----------+
GEOMEAN          |  -0.01%  |

This patch has the largest effect on the consumer-typeset benchmark, which exhibits compile-time regressions (+0.14% for O3, +0.12% for ReleaseLTO-g) when SpecializeLiteralConstant is enabled despite no specializations being created.
With this patch applied the compile-time changes for consumer-typeset when SpecializeLiteralConstant is enabled are -0.01% for O3 and +0.04% for ReleaseLTO-g.