From d3823156e7d19413f76c3b1943e3d0d4b31bf09c Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye@arm.com>
Date: Wed, 16 Oct 2024 23:25:54 +0000
Subject: [PATCH 1/4] [FuncSpec] Only compute Latency bonus when necessary

Only compute the Latency component of a specialisation's Bonus when
necessary, to avoid unnecessarily computing the Block Frequency
Information for a Function.
---
 .../Transforms/IPO/FunctionSpecialization.h   |  23 ++-
 .../Transforms/IPO/FunctionSpecialization.cpp |  85 +++++----
 .../Transforms/SCCP/ipsccp-preserve-pdt.ll    |  18 +-
 .../IPO/FunctionSpecializationTest.cpp        | 177 +++++++++++-------
 4 files changed, 182 insertions(+), 121 deletions(-)
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index b001771951e0f..a50eafe48293a 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -173,8 +173,9 @@ struct Bonus {
 };
 
 class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
+  std::function<BlockFrequencyInfo &(Function &)> GetBFI;
+  Function *F;
   const DataLayout &DL;
-  BlockFrequencyInfo &BFI;
   TargetTransformInfo &TTI;
   SCCPSolver &Solver;
 
@@ -192,17 +193,20 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   ConstMap::iterator LastVisited;
 
 public:
-  InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI,
-                  TargetTransformInfo &TTI, SCCPSolver &Solver)
-      : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {}
+  InstCostVisitor(std::function<BlockFrequencyInfo &(Function &)> GetBFI,
+                  Function *F, const DataLayout &DL, TargetTransformInfo &TTI,
+                  SCCPSolver &Solver)
+      : GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {}
 
   bool isBlockExecutable(BasicBlock *BB) {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  Bonus getSpecializationBonus(Argument *A, Constant *C);
+  Cost getCodeSizeBonus(Argument *A, Constant *C);
+
+  Cost getCodeSizeBonusFromPendingPHIs();
 
-  Bonus getBonusFromPendingPHIs();
+  Cost getLatencyBonus();
 
 private:
   friend class InstVisitor<InstCostVisitor, Constant *>;
@@ -210,8 +214,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                     DenseSet<BasicBlock *> &DeadBlocks);
 
-  Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
-                     Constant *C = nullptr);
+  Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr,
+                            Constant *C = nullptr);
 
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
@@ -283,9 +287,8 @@ class FunctionSpecializer {
   bool run();
 
   InstCostVisitor getInstCostVisitorFor(Function *F) {
-    auto &BFI = GetBFI(*F);
     auto &TTI = GetTTI(*F);
-    return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
+    return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver);
   }
 
 private:
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 7feebbe420ae5..38c10c29cf4e3 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
 Cost InstCostVisitor::estimateBasicBlocks(
                           SmallVectorImpl<BasicBlock *> &WorkList) {
   Cost CodeSize = 0;
-  // Accumulate the instruction cost of each basic block weighted by frequency.
+  // Accumulate the codesize savings of each basic block.
   while (!WorkList.empty()) {
     BasicBlock *BB = WorkList.pop_back_val();
 
@@ -154,37 +154,55 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
   return KnownConstants.lookup(V);
 }
 
-Bonus InstCostVisitor::getBonusFromPendingPHIs() {
-  Bonus B;
+Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() {
+  Cost CodeSize;
   while (!PendingPHIs.empty()) {
     Instruction *Phi = PendingPHIs.pop_back_val();
     // The pending PHIs could have been proven dead by now.
     if (isBlockExecutable(Phi->getParent()))
-      B += getUserBonus(Phi);
+      CodeSize += getUserCodeSizeBonus(Phi);
   }
-  return B;
+  return CodeSize;
 }
 
-/// Compute a bonus for replacing argument \p A with constant \p C.
-Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
+/// Compute the codesize savings for replacing argument \p A with constant \p C.
+Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) {
   LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                     << C->getNameOrAsOperand() << "\n");
-  Bonus B;
+  Cost CodeSize;
   for (auto *U : A->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, A, C);
+        CodeSize += getUserCodeSizeBonus(UI, A, C);
 
   LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
-                    << B.CodeSize << ", Latency = " << B.Latency
-                    << "} for argument " << *A << "\n");
-  return B;
+                    << CodeSize << "} for argument " << *A << "\n");
+  return CodeSize;
+}
+
+Cost InstCostVisitor::getLatencyBonus() {
+  auto &BFI = GetBFI(*F);
+  Cost Latency = 0;
+
+  for (auto Pair : KnownConstants) {
+    Instruction *I = dyn_cast<Instruction>(Pair.first);
+    if (!I)
+      continue;
+
+    uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
+                      BFI.getEntryFreq().getFrequency();
+    Latency +=
+        Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  }
+
+  return Latency;
 }
 
-Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
+Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use,
+                                           Constant *C) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
-    return {0, 0};
+    return 0;
 
   // Cache the iterator before visiting.
   LastVisited = Use ? KnownConstants.insert({Use, C}).first
@@ -198,7 +216,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
   } else {
     C = visit(*User);
     if (!C)
-      return {0, 0};
+      return 0;
   }
 
   // Even though it doesn't make sense to bind switch and branch instructions
@@ -208,23 +226,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
 
   CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
 
-  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
-                    BFI.getEntryFreq().getFrequency();
-
-  Cost Latency = Weight *
-      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
-
   LLVM_DEBUG(dbgs() << "FnSpecialization:     {CodeSize = " << CodeSize
-                    << ", Latency = " << Latency << "} for user "
-                    << *User << "\n");
+                    << "} for user " << *User << "\n");
 
-  Bonus B(CodeSize, Latency);
   for (auto *U : User->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (UI != User && isBlockExecutable(UI->getParent()))
-        B += getUserBonus(UI, User, C);
+        CodeSize += getUserCodeSizeBonus(UI, User, C);
 
-  return B;
+  return CodeSize;
 }
 
 Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
@@ -875,24 +885,23 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       AllSpecs[Index].CallSites.push_back(&CS);
     } else {
       // Calculate the specialisation gain.
-      Bonus B;
+      Cost CodeSize;
       unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
       for (ArgInfo &A : S.Args) {
-        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
+        CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual);
         Score += getInliningBonus(A.Formal, A.Actual);
       }
-      B += Visitor.getBonusFromPendingPHIs();
-
+      CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs();
 
       LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
-                        << B.CodeSize << ", Latency = " << B.Latency
-                        << ", Inlining = " << Score << "}\n");
+                        << CodeSize << ", Inlining = " << Score << "}\n");
 
+      Bonus B = {CodeSize, 0};
       FunctionGrowth[F] += FuncSize - B.CodeSize;
 
       auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
-                             unsigned FuncGrowth) -> bool {
+                             unsigned FuncGrowth, InstCostVisitor &V) -> bool {
         // No check required.
         if (ForceSpecialization)
           return true;
@@ -902,6 +911,14 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
         // Minimum codesize savings.
         if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
           return false;
+
+        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
+        B += {0, V.getLatencyBonus()};
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
+                   << B.Latency << "}\n");
+
         // Minimum latency savings.
         if (B.Latency < MinLatencySavings * FuncSize / 100)
           return false;
@@ -912,7 +929,7 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       };
 
       // Discard unprofitable specialisations.
-      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
+      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor))
         continue;
 
       // Create a new specialisation entry.
diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
index ff57569d12788..f8c8e33dfc233 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
@@ -4,25 +4,25 @@
 
 ; This test case is trying to validate that the postdomtree is preserved
 ; correctly by the ipsccp pass. A tricky bug was introduced in commit
-; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order
+; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order
 ; to setup a DomTreeUpdater (to update the PDT during transformation in order
 ; to preserve the analysis). But given that commit the PDT could end up being
 ; required and calculated via BlockFrequency analysis. So the problem was that
 ; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't
-; cached at the begininng of IPSCCP, to indicate that no updates where needed
+; cached at the beginning of IPSCCP, to indicate that no updates were needed
 ; for PDT. But then the PDT was calculated, given the input IR, and preserved
 ; using the non-updated state (as the DTU wasn't configured for updating the
 ; PDT).
 
 ; CHECK-NOT: <badref>
 ; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
-; CHECK-NEXT:   [1]  <<exit node>> {4294967295,4294967295} [0]
-; CHECK-NEXT:     [2] %for.cond34 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %for.cond16 {4294967295,4294967295} [2]
-; CHECK-NEXT:     [2] %for.body {4294967295,4294967295} [1]
-; CHECK-NEXT:     [2] %if.end4 {4294967295,4294967295} [1]
-; CHECK-NEXT:       [3] %entry {4294967295,4294967295} [2]
-; CHECK-NEXT: Roots: %for.cond34 %for.body
+; CHECK-NEXT:  [1]  <<exit node>> {4294967295,4294967295} [0]
+; CHECK-NEXT:    [2] %for.body {4294967295,4294967295} [1]
+; CHECK-NEXT:    [2] %if.end4 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %entry {4294967295,4294967295} [2]
+; CHECK-NEXT:    [2] %for.cond34 {4294967295,4294967295} [1]
+; CHECK-NEXT:      [3] %for.cond16 {4294967295,4294967295} [2]
+; CHECK-NEXT: Roots: %for.body %for.cond34
 ; CHECK-NEXT: PostDominatorTree for function: bar
 ; CHECK-NOT: <badref>
 
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index b0ff55489e176..9f2053948cb33 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -43,6 +43,7 @@ class FunctionSpecializationTest : public testing::Test {
   FunctionAnalysisManager FAM;
   std::unique_ptr<Module> M;
   std::unique_ptr<SCCPSolver> Solver;
+  SmallVector<Instruction *, 8> Instructions;
 
   FunctionSpecializationTest() {
     FAM.registerPass([&] { return TargetLibraryAnalysis(); });
@@ -97,21 +98,29 @@ class FunctionSpecializationTest : public testing::Test {
                                GetAC);
   }
 
-  Bonus getInstCost(Instruction &I, bool SizeOnly = false) {
+  Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) {
     auto &TTI = FAM.getResult<TargetIRAnalysis>(*I.getFunction());
-    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*I.getFunction());
 
     Cost CodeSize =
         TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
 
-    Cost Latency =
-        SizeOnly
-            ? 0
-            : BFI.getBlockFreq(I.getParent()).getFrequency() /
-                  BFI.getEntryFreq().getFrequency() *
-                  TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
+    if (!SizeOnly)
+      Instructions.push_back(&I);
 
-    return {CodeSize, Latency};
+    return CodeSize;
+  }
+
+  Cost getLatency(Function *F) {
+    auto &TTI = FAM.getResult<TargetIRAnalysis>(*F);
+    auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*F);
+
+    Cost Latency = 0;
+    for (const Instruction *I : Instructions)
+      Latency += BFI.getBlockFreq(I->getParent()).getFrequency() /
+                 BFI.getEntryFreq().getFrequency() *
+                 TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+
+    return Latency;
   }
 };
 
@@ -171,25 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
   Instruction &BrLoop = BB2.back();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // and + or + add
-  Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // switch + sdiv + br + br
-  Ref = getInstCost(Switch) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
+  Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, BranchInst) {
@@ -238,27 +252,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
   Instruction &BrLoop = BB2.front();
 
   // mul
-  Bonus Ref = getInstCost(Mul);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Mul);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // add
-  Ref = getInstCost(Add);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
+  Ref = getInstCodeSize(Add);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // branch + sub + br + sdiv + br
-  Ref = getInstCost(Branch) +
-        getInstCost(Sub, /*SizeOnly =*/ true) +
-        getInstCost(BrBB1BB2) +
-        getInstCost(Sdiv, /*SizeOnly =*/ true) +
-        getInstCost(BrBB2, /*SizeOnly =*/ true) +
-        getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), False);
+  Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
+        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
+        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), False);
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, SelectInst) {
@@ -279,14 +297,22 @@ TEST_F(FunctionSpecializationTest, SelectInst) {
   Constant *False = ConstantInt::getFalse(M.getContext());
   Instruction &Select = *F->front().begin();
 
-  Bonus Ref = getInstCost(Select);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), False);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), Zero);
-  EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  Cost RefCodeSize = getInstCodeSize(Select);
+  Cost RefLatency = getLatency(F);
+
+  Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False);
+  EXPECT_TRUE(TestCodeSize == 0);
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(TestCodeSize == 0);
+  Cost TestLatency = Visitor.getLatencyBonus();
+  EXPECT_TRUE(TestLatency == 0);
+
+  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero);
+  EXPECT_EQ(TestCodeSize, RefCodeSize);
+  EXPECT_TRUE(TestCodeSize > 0);
+  TestLatency = Visitor.getLatencyBonus();
+  EXPECT_EQ(TestLatency, RefLatency);
+  EXPECT_TRUE(TestLatency > 0);
 }
 
 TEST_F(FunctionSpecializationTest, Misc) {
@@ -332,26 +358,32 @@ TEST_F(FunctionSpecializationTest, Misc) {
   Instruction &Smax = *BlockIter++;
 
   // icmp + zext
-  Bonus Ref = getInstCost(Icmp) + getInstCost(Zext);
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
+  Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // select
-  Ref = getInstCost(Select);
-  Test = Visitor.getSpecializationBonus(F->getArg(1), True);
+  Ref = getInstCodeSize(Select);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), True);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
   // gep + load + freeze + smax
-  Ref = getInstCost(Gep) + getInstCost(Load) + getInstCost(Freeze) +
-        getInstCost(Smax);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), GV);
+  Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) +
+        getInstCodeSize(Smax);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), GV);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(3), Undef);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef);
+  EXPECT_TRUE(Test == 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
 }
 
 TEST_F(FunctionSpecializationTest, PhiNode) {
@@ -401,25 +433,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
   Instruction &Icmp = *++BB.begin();
   Instruction &Branch = BB.back();
 
-  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  EXPECT_TRUE(Test == 0);
 
-  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
-  EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
+  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  EXPECT_TRUE(Test == 0);
+
+  Test = Visitor.getLatencyBonus();
+  EXPECT_TRUE(Test == 0);
 
   // switch + phi + br
-  Bonus Ref = getInstCost(Switch) +
-              getInstCost(PhiCase2, /*SizeOnly =*/ true) +
-              getInstCost(BrBB, /*SizeOnly =*/ true);
-  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
+  Cost Ref = getInstCodeSize(Switch) +
+             getInstCodeSize(PhiCase2, /*SizeOnly =*/true) +
+             getInstCodeSize(BrBB, /*SizeOnly =*/true);
+  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0 && Test > 0);
 
   // phi + phi + add + icmp + branch
-  Ref = getInstCost(PhiBB) + getInstCost(PhiLoop) + getInstCost(Add) +
-        getInstCost(Icmp) + getInstCost(Branch);
-  Test = Visitor.getBonusFromPendingPHIs();
+  Ref = getInstCodeSize(PhiBB) + getInstCodeSize(PhiLoop) +
+        getInstCodeSize(Add) + getInstCodeSize(Icmp) + getInstCodeSize(Branch);
+  Test = Visitor.getCodeSizeBonusFromPendingPHIs();
+  EXPECT_EQ(Test, Ref);
+  EXPECT_TRUE(Test > 0);
+
+  // Latency.
+  Ref = getLatency(F);
+  Test = Visitor.getLatencyBonus();
   EXPECT_EQ(Test, Ref);
-  EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
+  EXPECT_TRUE(Test > 0);
 }
 

From fc3aaa44dbcc0e5d78e55816b51dc7437aae85d0 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye@arm.com>
Date: Mon, 21 Oct 2024 15:16:41 +0000
Subject: [PATCH 2/4] Address review comments

---
 .../Transforms/IPO/FunctionSpecialization.h   |  42 +------
 .../Transforms/IPO/FunctionSpecialization.cpp |  62 +++++++---
 .../IPO/FunctionSpecializationTest.cpp        | 117 +++++++++---------
 3 files changed, 109 insertions(+), 112 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index a50eafe48293a..5920dde9d77df 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -140,38 +140,6 @@ struct Spec {
       : F(F), Sig(S), Score(Score) {}
 };
 
-struct Bonus {
-  unsigned CodeSize = 0;
-  unsigned Latency = 0;
-
-  Bonus() = default;
-
-  Bonus(Cost CodeSize, Cost Latency) {
-    int64_t Sz = *CodeSize.getValue();
-    int64_t Ltc = *Latency.getValue();
-
-    assert(Sz >= 0 && Ltc >= 0 && "CodeSize and Latency cannot be negative");
-    // It is safe to down cast since we know the arguments
-    // cannot be negative and Cost is of type int64_t.
-    this->CodeSize = static_cast<unsigned>(Sz);
-    this->Latency = static_cast<unsigned>(Ltc);
-  }
-
-  Bonus &operator+=(const Bonus RHS) {
-    CodeSize += RHS.CodeSize;
-    Latency += RHS.Latency;
-    return *this;
-  }
-
-  Bonus operator+(const Bonus RHS) const {
-    return Bonus(CodeSize + RHS.CodeSize, Latency + RHS.Latency);
-  }
-
-  bool operator==(const Bonus RHS) const {
-    return CodeSize == RHS.CodeSize && Latency == RHS.Latency;
-  }
-};
-
 class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   std::function<BlockFrequencyInfo &(Function &)> GetBFI;
   Function *F;
@@ -202,11 +170,11 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  Cost getCodeSizeBonus(Argument *A, Constant *C);
+  Cost getCodeSizeSavingsForArg(Argument *A, Constant *C);
 
-  Cost getCodeSizeBonusFromPendingPHIs();
+  Cost getCodeSizeSavingsFromPendingPHIs();
 
-  Cost getLatencyBonus();
+  Cost getLatencySavingsForKnownConstants();
 
 private:
   friend class InstVisitor<InstCostVisitor, Constant *>;
@@ -214,8 +182,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                     DenseSet<BasicBlock *> &DeadBlocks);
 
-  Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr,
-                            Constant *C = nullptr);
+  Cost getCodeSizeSavingsForUser(Instruction *User, Value *Use = nullptr,
+                                 Constant *C = nullptr);
 
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 38c10c29cf4e3..04a62837ff962 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -154,33 +154,45 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
   return KnownConstants.lookup(V);
 }
 
-Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() {
+Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
   Cost CodeSize;
   while (!PendingPHIs.empty()) {
     Instruction *Phi = PendingPHIs.pop_back_val();
     // The pending PHIs could have been proven dead by now.
     if (isBlockExecutable(Phi->getParent()))
-      CodeSize += getUserCodeSizeBonus(Phi);
+      CodeSize += getCodeSizeSavingsForUser(Phi);
   }
   return CodeSize;
 }
 
 /// Compute the codesize savings for replacing argument \p A with constant \p C.
-Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) {
+Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
   LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                     << C->getNameOrAsOperand() << "\n");
   Cost CodeSize;
   for (auto *U : A->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (isBlockExecutable(UI->getParent()))
-        CodeSize += getUserCodeSizeBonus(UI, A, C);
+        CodeSize += getCodeSizeSavingsForUser(UI, A, C);
 
   LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
                     << CodeSize << "} for argument " << *A << "\n");
   return CodeSize;
 }
 
-Cost InstCostVisitor::getLatencyBonus() {
+/// Compute the latency savings from replacing all arguments with constants for
+/// a specialization candidate. As this function computes the latency savings
+/// for all Instructions in KnownConstants at once, it should be called only
+/// after every instruction has been visited, i.e. after:
+///
+/// * getCodeSizeBonus has been run for every constant argument of a
+///   specialization candidate
+///
+/// * getCodeSizeBonusFromPendingPHIs has been run
+///
+/// to ensure that the latency savings are calculated for all Instructions we
+/// have visited and found to be constant.
+Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
   auto &BFI = GetBFI(*F);
   Cost Latency = 0;
 
@@ -198,8 +210,8 @@ Cost InstCostVisitor::getLatencyBonus() {
   return Latency;
 }
 
-Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use,
-                                           Constant *C) {
+Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
+                                                Constant *C) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
     return 0;
@@ -232,7 +244,7 @@ Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use,
   for (auto *U : User->users())
     if (auto *UI = dyn_cast<Instruction>(U))
       if (UI != User && isBlockExecutable(UI->getParent()))
-        CodeSize += getUserCodeSizeBonus(UI, User, C);
+        CodeSize += getCodeSizeSavingsForUser(UI, User, C);
 
   return CodeSize;
 }
@@ -819,6 +831,15 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
   return Clone;
 }
 
+static unsigned getCostValue(const Cost &C) {
+  int64_t Value = *C.getValue();
+
+  assert(Value >= 0 && "CodeSize and Latency cannot be negative");
+  // It is safe to down cast since we know the arguments cannot be negative and
+  // Cost is of type int64_t.
+  return static_cast<unsigned>(Value);
+}
+
 bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                               SmallVectorImpl<Spec> &AllSpecs,
                                               SpecMap &SM) {
@@ -889,18 +910,20 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
       for (ArgInfo &A : S.Args) {
-        CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual);
+        CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
         Score += getInliningBonus(A.Formal, A.Actual);
       }
-      CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs();
+      CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();
 
       LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                         << CodeSize << ", Inlining = " << Score << "}\n");
 
-      Bonus B = {CodeSize, 0};
-      FunctionGrowth[F] += FuncSize - B.CodeSize;
+      unsigned LatencySavings = 0;
+      unsigned CodeSizeSavings = getCostValue(CodeSize);
+      FunctionGrowth[F] += FuncSize - CodeSizeSavings;
 
-      auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
+      auto IsProfitable = [](unsigned CodeSizeSavings, unsigned &LatencySavings,
+                             unsigned Score, unsigned FuncSize,
                              unsigned FuncGrowth, InstCostVisitor &V) -> bool {
         // No check required.
         if (ForceSpecialization)
@@ -909,18 +932,18 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
         if (Score > MinInliningBonus * FuncSize / 100)
           return true;
         // Minimum codesize savings.
-        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
+        if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
           return false;
 
         // Lazily compute the Latency, to avoid unnecessarily computing BFI.
-        B += {0, V.getLatencyBonus()};
+        LatencySavings = getCostValue(V.getLatencySavingsForKnownConstants());
 
         LLVM_DEBUG(
             dbgs() << "FnSpecialization: Specialization bonus {Latency = "
-                   << B.Latency << "}\n");
+                   << LatencySavings << "}\n");
 
         // Minimum latency savings.
-        if (B.Latency < MinLatencySavings * FuncSize / 100)
+        if (LatencySavings < MinLatencySavings * FuncSize / 100)
           return false;
         // Maximum codesize growth.
         if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
@@ -929,11 +952,12 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       };
 
       // Discard unprofitable specialisations.
-      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor))
+      if (!IsProfitable(CodeSizeSavings, LatencySavings, Score, FuncSize,
+                        FunctionGrowth[F], Visitor))
         continue;
 
       // Create a new specialisation entry.
-      Score += std::max(B.CodeSize, B.Latency);
+      Score += std::max(CodeSizeSavings, LatencySavings);
       auto &Spec = AllSpecs.emplace_back(F, S, Score);
       if (CS.getFunction() != F)
         Spec.CallSites.push_back(&CS);
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index 9f2053948cb33..c8fd366bfac65 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -43,7 +43,7 @@ class FunctionSpecializationTest : public testing::Test {
   FunctionAnalysisManager FAM;
   std::unique_ptr<Module> M;
   std::unique_ptr<SCCPSolver> Solver;
-  SmallVector<Instruction *, 8> Instructions;
+  SmallVector<Instruction *, 8> KnownConstants;
 
   FunctionSpecializationTest() {
     FAM.registerPass([&] { return TargetLibraryAnalysis(); });
@@ -98,24 +98,24 @@ class FunctionSpecializationTest : public testing::Test {
                                GetAC);
   }
 
-  Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) {
+  Cost getCodeSizeSavings(Instruction &I, bool HasLatencySavings = true) {
     auto &TTI = FAM.getResult<TargetIRAnalysis>(*I.getFunction());
 
     Cost CodeSize =
         TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
 
-    if (!SizeOnly)
-      Instructions.push_back(&I);
+    if (HasLatencySavings)
+      KnownConstants.push_back(&I);
 
     return CodeSize;
   }
 
-  Cost getLatency(Function *F) {
+  Cost getLatencySavings(Function *F) {
     auto &TTI = FAM.getResult<TargetIRAnalysis>(*F);
     auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*F);
 
     Cost Latency = 0;
-    for (const Instruction *I : Instructions)
+    for (const Instruction *I : KnownConstants)
       Latency += BFI.getBlockFreq(I->getParent()).getFrequency() /
                  BFI.getEntryFreq().getFrequency() *
                  TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
@@ -180,28 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
   Instruction &BrLoop = BB2.back();
 
   // mul
-  Cost Ref = getInstCodeSize(Mul);
-  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  Cost Ref = getCodeSizeSavings(Mul);
+  Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // and + or + add
-  Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add);
-  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  Ref = getCodeSizeSavings(And) + getCodeSizeSavings(Or) +
+        getCodeSizeSavings(Add);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // switch + sdiv + br + br
-  Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
-        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
-        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
-  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
+  Ref = getCodeSizeSavings(Switch) +
+        getCodeSizeSavings(Sdiv, /*HasLatencySavings=*/false) +
+        getCodeSizeSavings(BrBB2, /*HasLatencySavings=*/false) +
+        getCodeSizeSavings(BrLoop, /*HasLatencySavings=*/false);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // Latency.
-  Ref = getLatency(F);
-  Test = Visitor.getLatencyBonus();
+  Ref = getLatencySavings(F);
+  Test = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 }
@@ -252,29 +254,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
   Instruction &BrLoop = BB2.front();
 
   // mul
-  Cost Ref = getInstCodeSize(Mul);
-  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  Cost Ref = getCodeSizeSavings(Mul);
+  Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // add
-  Ref = getInstCodeSize(Add);
-  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  Ref = getCodeSizeSavings(Add);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // branch + sub + br + sdiv + br
-  Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) +
-        getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) +
-        getInstCodeSize(BrBB2, /*SizeOnly =*/true) +
-        getInstCodeSize(BrLoop, /*SizeOnly =*/true);
-  Test = Visitor.getCodeSizeBonus(F->getArg(2), False);
+  Ref = getCodeSizeSavings(Branch) +
+        getCodeSizeSavings(Sub, /*HasLatencySavings=*/false) +
+        getCodeSizeSavings(BrBB1BB2) +
+        getCodeSizeSavings(Sdiv, /*HasLatencySavings=*/false) +
+        getCodeSizeSavings(BrBB2, /*HasLatencySavings=*/false) +
+        getCodeSizeSavings(BrLoop, /*HasLatencySavings=*/false);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), False);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // Latency.
-  Ref = getLatency(F);
-  Test = Visitor.getLatencyBonus();
+  Ref = getLatencySavings(F);
+  Test = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 }
@@ -297,20 +301,20 @@ TEST_F(FunctionSpecializationTest, SelectInst) {
   Constant *False = ConstantInt::getFalse(M.getContext());
   Instruction &Select = *F->front().begin();
 
-  Cost RefCodeSize = getInstCodeSize(Select);
-  Cost RefLatency = getLatency(F);
+  Cost RefCodeSize = getCodeSizeSavings(Select);
+  Cost RefLatency = getLatencySavings(F);
 
-  Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False);
+  Cost TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(0), False);
   EXPECT_TRUE(TestCodeSize == 0);
-  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One);
   EXPECT_TRUE(TestCodeSize == 0);
-  Cost TestLatency = Visitor.getLatencyBonus();
+  Cost TestLatency = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_TRUE(TestLatency == 0);
 
-  TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero);
+  TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(2), Zero);
   EXPECT_EQ(TestCodeSize, RefCodeSize);
   EXPECT_TRUE(TestCodeSize > 0);
-  TestLatency = Visitor.getLatencyBonus();
+  TestLatency = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_EQ(TestLatency, RefLatency);
   EXPECT_TRUE(TestLatency > 0);
 }
@@ -358,30 +362,30 @@ TEST_F(FunctionSpecializationTest, Misc) {
   Instruction &Smax = *BlockIter++;
 
   // icmp + zext
-  Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext);
-  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  Cost Ref = getCodeSizeSavings(Icmp) + getCodeSizeSavings(Zext);
+  Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // select
-  Ref = getInstCodeSize(Select);
-  Test = Visitor.getCodeSizeBonus(F->getArg(1), True);
+  Ref = getCodeSizeSavings(Select);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), True);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // gep + load + freeze + smax
-  Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) +
-        getInstCodeSize(Smax);
-  Test = Visitor.getCodeSizeBonus(F->getArg(2), GV);
+  Ref = getCodeSizeSavings(Gep) + getCodeSizeSavings(Load) +
+        getCodeSizeSavings(Freeze) + getCodeSizeSavings(Smax);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), GV);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
-  Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(3), Undef);
   EXPECT_TRUE(Test == 0);
 
   // Latency.
-  Ref = getLatency(F);
-  Test = Visitor.getLatencyBonus();
+  Ref = getLatencySavings(F);
+  Test = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 }
@@ -433,33 +437,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
   Instruction &Icmp = *++BB.begin();
   Instruction &Branch = BB.back();
 
-  Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One);
+  Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One);
   EXPECT_TRUE(Test == 0);
 
-  Test = Visitor.getCodeSizeBonus(F->getArg(1), One);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One);
   EXPECT_TRUE(Test == 0);
 
-  Test = Visitor.getLatencyBonus();
+  Test = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_TRUE(Test == 0);
 
   // switch + phi + br
-  Cost Ref = getInstCodeSize(Switch) +
-             getInstCodeSize(PhiCase2, /*SizeOnly =*/true) +
-             getInstCodeSize(BrBB, /*SizeOnly =*/true);
-  Test = Visitor.getCodeSizeBonus(F->getArg(2), One);
+  Cost Ref = getCodeSizeSavings(Switch) +
+             getCodeSizeSavings(PhiCase2, /*HasLatencySavings=*/false) +
+             getCodeSizeSavings(BrBB, /*HasLatencySavings=*/false);
+  Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0 && Test > 0);
 
   // phi + phi + add + icmp + branch
-  Ref = getInstCodeSize(PhiBB) + getInstCodeSize(PhiLoop) +
-        getInstCodeSize(Add) + getInstCodeSize(Icmp) + getInstCodeSize(Branch);
-  Test = Visitor.getCodeSizeBonusFromPendingPHIs();
+  Ref = getCodeSizeSavings(PhiBB) + getCodeSizeSavings(PhiLoop) +
+        getCodeSizeSavings(Add) + getCodeSizeSavings(Icmp) +
+        getCodeSizeSavings(Branch);
+  Test = Visitor.getCodeSizeSavingsFromPendingPHIs();
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 
   // Latency.
-  Ref = getLatency(F);
-  Test = Visitor.getLatencyBonus();
+  Ref = getLatencySavings(F);
+  Test = Visitor.getLatencySavingsForKnownConstants();
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test > 0);
 }

From b9336f26bc021d91a1fc1d5a2495b11124ab67f5 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye@arm.com>
Date: Mon, 21 Oct 2024 21:49:53 +0000
Subject: [PATCH 3/4] Address review comments 2

- Fix outdated function names in comments.
- Sink debug messages about profitability into the IsProfitable lambda,
  and also add percentage of Function size while we're here.
- Move other profitability related statements into IsProfitable.
- Add a TODO to improve the accumulation of codesize increases
  (FunctionGrowth)
---
 .../Transforms/IPO/FunctionSpecialization.cpp | 48 ++++++++++++-------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 04a62837ff962..35a75515cca22 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -185,10 +185,10 @@ Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
 /// for all Instructions in KnownConstants at once, it should be called only
 /// after every instruction has been visited, i.e. after:
 ///
-/// * getCodeSizeBonus has been run for every constant argument of a
+/// * getCodeSizeSavingsForArg has been run for every constant argument of a
 ///   specialization candidate
 ///
-/// * getCodeSizeBonusFromPendingPHIs has been run
+/// * getCodeSizeSavingsFromPendingPHIs has been run
 ///
 /// to ensure that the latency savings are calculated for all Instructions we
 /// have visited and found to be constant.
@@ -831,6 +831,9 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
   return Clone;
 }
 
+/// Get the unsigned Value of given Cost object. Assumes the Cost is always
+/// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and
+/// always Valid.
 static unsigned getCostValue(const Cost &C) {
   int64_t Value = *C.getValue();
 
@@ -915,49 +918,58 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       }
       CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();
 
-      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
-                        << CodeSize << ", Inlining = " << Score << "}\n");
-
-      unsigned LatencySavings = 0;
-      unsigned CodeSizeSavings = getCostValue(CodeSize);
-      FunctionGrowth[F] += FuncSize - CodeSizeSavings;
-
-      auto IsProfitable = [](unsigned CodeSizeSavings, unsigned &LatencySavings,
-                             unsigned Score, unsigned FuncSize,
-                             unsigned FuncGrowth, InstCostVisitor &V) -> bool {
+      auto IsProfitable = [&]() -> bool {
         // No check required.
         if (ForceSpecialization)
           return true;
+
+        unsigned CodeSizeSavings = getCostValue(CodeSize);
+        // TODO: We should only accumulate codesize increase of specializations
+        // that are actually created.
+        FunctionGrowth[F] += FuncSize - CodeSizeSavings;
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
+                   << Score << " (" << (Score * 100 / FuncSize) << "%)}\n");
+
         // Minimum inlining bonus.
         if (Score > MinInliningBonus * FuncSize / 100)
           return true;
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
+                   << CodeSizeSavings << " ("
+                   << (CodeSizeSavings * 100 / FuncSize) << "%)}\n");
+
         // Minimum codesize savings.
         if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
           return false;
 
         // Lazily compute the Latency, to avoid unnecessarily computing BFI.
-        LatencySavings = getCostValue(V.getLatencySavingsForKnownConstants());
+        unsigned LatencySavings =
+            getCostValue(Visitor.getLatencySavingsForKnownConstants());
 
         LLVM_DEBUG(
             dbgs() << "FnSpecialization: Specialization bonus {Latency = "
-                   << LatencySavings << "}\n");
+                   << LatencySavings << " ("
+                   << (LatencySavings * 100 / FuncSize) << "%)}\n");
 
         // Minimum latency savings.
         if (LatencySavings < MinLatencySavings * FuncSize / 100)
           return false;
         // Maximum codesize growth.
-        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
+        if (FunctionGrowth[F] / FuncSize > MaxCodeSizeGrowth)
           return false;
+
+        Score += std::max(CodeSizeSavings, LatencySavings);
         return true;
       };
 
       // Discard unprofitable specialisations.
-      if (!IsProfitable(CodeSizeSavings, LatencySavings, Score, FuncSize,
-                        FunctionGrowth[F], Visitor))
+      if (!IsProfitable())
         continue;
 
       // Create a new specialisation entry.
-      Score += std::max(CodeSizeSavings, LatencySavings);
       auto &Spec = AllSpecs.emplace_back(F, S, Score);
       if (CS.getFunction() != F)
         Spec.CallSites.push_back(&CS);

From 0a5d644a0c877d019a152a5db6d79760b8f117d0 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye@arm.com>
Date: Tue, 22 Oct 2024 14:36:55 +0000
Subject: [PATCH 4/4] Address review 3

- Add debug statement to getLatencySavingsForKnownConstants
---
 llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 35a75515cca22..20249a20a37e4 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -194,7 +194,7 @@ Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
 /// have visited and found to be constant.
 Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
   auto &BFI = GetBFI(*F);
-  Cost Latency = 0;
+  Cost TotalLatency = 0;
 
   for (auto Pair : KnownConstants) {
     Instruction *I = dyn_cast<Instruction>(Pair.first);
@@ -203,11 +203,17 @@ Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
 
     uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
                       BFI.getEntryFreq().getFrequency();
-    Latency +=
+
+    Cost Latency =
         Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+
+    LLVM_DEBUG(dbgs() << "FnSpecialization:     {Latency = " << Latency
+                      << "} for instruction " << *I << "\n");
+
+    TotalLatency += Latency;
   }
 
-  return Latency;
+  return TotalLatency;
 }
 
 Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,