From d3823156e7d19413f76c3b1943e3d0d4b31bf09c Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Wed, 16 Oct 2024 23:25:54 +0000 Subject: [PATCH 1/4] [FuncSpec] Only compute Latency bonus when necessary Only compute the Latency component of a specialisation's Bonus when necessary, to avoid unnecessarily computing the Block Frequency Information for a Function. --- .../Transforms/IPO/FunctionSpecialization.h | 23 ++- .../Transforms/IPO/FunctionSpecialization.cpp | 85 +++++---- .../Transforms/SCCP/ipsccp-preserve-pdt.ll | 18 +- .../IPO/FunctionSpecializationTest.cpp | 177 +++++++++++------- 4 files changed, 182 insertions(+), 121 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index b001771951e0f..a50eafe48293a 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -173,8 +173,9 @@ struct Bonus { }; class InstCostVisitor : public InstVisitor { + std::function GetBFI; + Function *F; const DataLayout &DL; - BlockFrequencyInfo &BFI; TargetTransformInfo &TTI; SCCPSolver &Solver; @@ -192,17 +193,20 @@ class InstCostVisitor : public InstVisitor { ConstMap::iterator LastVisited; public: - InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI, - TargetTransformInfo &TTI, SCCPSolver &Solver) - : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {} + InstCostVisitor(std::function GetBFI, + Function *F, const DataLayout &DL, TargetTransformInfo &TTI, + SCCPSolver &Solver) + : GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {} bool isBlockExecutable(BasicBlock *BB) { return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB); } - Bonus getSpecializationBonus(Argument *A, Constant *C); + Cost getCodeSizeBonus(Argument *A, Constant *C); + + Cost getCodeSizeBonusFromPendingPHIs(); - Bonus getBonusFromPendingPHIs(); + Cost getLatencyBonus(); private: friend class InstVisitor; @@ -210,8 +214,8 @@ class InstCostVisitor : public InstVisitor { static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ, DenseSet &DeadBlocks); - Bonus getUserBonus(Instruction *User, Value *Use = nullptr, - Constant *C = nullptr); + Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr, + Constant *C = nullptr); Cost estimateBasicBlocks(SmallVectorImpl &WorkList); Cost estimateSwitchInst(SwitchInst &I); @@ -283,9 +287,8 @@ class FunctionSpecializer { bool run(); InstCostVisitor getInstCostVisitorFor(Function *F) { - auto &BFI = GetBFI(*F); auto &TTI = GetTTI(*F); - return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver); + return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver); } private: diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 7feebbe420ae5..38c10c29cf4e3 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ, Cost InstCostVisitor::estimateBasicBlocks( SmallVectorImpl &WorkList) { Cost CodeSize = 0; - // Accumulate the instruction cost of each basic block weighted by frequency. + // Accumulate the codesize savings of each basic block. while (!WorkList.empty()) { BasicBlock *BB = WorkList.pop_back_val(); @@ -154,37 +154,55 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) { return KnownConstants.lookup(V); } -Bonus InstCostVisitor::getBonusFromPendingPHIs() { - Bonus B; +Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() { + Cost CodeSize; while (!PendingPHIs.empty()) { Instruction *Phi = PendingPHIs.pop_back_val(); // The pending PHIs could have been proven dead by now. if (isBlockExecutable(Phi->getParent())) - B += getUserBonus(Phi); + CodeSize += getUserCodeSizeBonus(Phi); } - return B; + return CodeSize; } -/// Compute a bonus for replacing argument \p A with constant \p C. -Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) { +/// Compute the codesize savings for replacing argument \p A with constant \p C. +Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) { LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: " << C->getNameOrAsOperand() << "\n"); - Bonus B; + Cost CodeSize; for (auto *U : A->users()) if (auto *UI = dyn_cast(U)) if (isBlockExecutable(UI->getParent())) - B += getUserBonus(UI, A, C); + CodeSize += getUserCodeSizeBonus(UI, A, C); LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = " - << B.CodeSize << ", Latency = " << B.Latency - << "} for argument " << *A << "\n"); - return B; + << CodeSize << "} for argument " << *A << "\n"); + return CodeSize; +} + +Cost InstCostVisitor::getLatencyBonus() { + auto &BFI = GetBFI(*F); + Cost Latency = 0; + + for (auto Pair : KnownConstants) { + Instruction *I = dyn_cast(Pair.first); + if (!I) + continue; + + uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() / + BFI.getEntryFreq().getFrequency(); + Latency += + Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); + } + + return Latency; } -Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { +Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use, + Constant *C) { // We have already propagated a constant for this user. if (KnownConstants.contains(User)) - return {0, 0}; + return 0; // Cache the iterator before visiting. LastVisited = Use ? KnownConstants.insert({Use, C}).first @@ -198,7 +216,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) } else { C = visit(*User); if (!C) - return {0, 0}; + return 0; } // Even though it doesn't make sense to bind switch and branch instructions @@ -208,23 +226,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize); - uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() / - BFI.getEntryFreq().getFrequency(); - - Cost Latency = Weight * - TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency); - LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize - << ", Latency = " << Latency << "} for user " - << *User << "\n"); + << "} for user " << *User << "\n"); - Bonus B(CodeSize, Latency); for (auto *U : User->users()) if (auto *UI = dyn_cast(U)) if (UI != User && isBlockExecutable(UI->getParent())) - B += getUserBonus(UI, User, C); + CodeSize += getUserCodeSizeBonus(UI, User, C); - return B; + return CodeSize; } Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { @@ -875,24 +885,23 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, AllSpecs[Index].CallSites.push_back(&CS); } else { // Calculate the specialisation gain. - Bonus B; + Cost CodeSize; unsigned Score = 0; InstCostVisitor Visitor = getInstCostVisitorFor(F); for (ArgInfo &A : S.Args) { - B += Visitor.getSpecializationBonus(A.Formal, A.Actual); + CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual); Score += getInliningBonus(A.Formal, A.Actual); } - B += Visitor.getBonusFromPendingPHIs(); - + CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs(); LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = " - << B.CodeSize << ", Latency = " << B.Latency - << ", Inlining = " << Score << "}\n"); + << CodeSize << ", Inlining = " << Score << "}\n"); + Bonus B = {CodeSize, 0}; FunctionGrowth[F] += FuncSize - B.CodeSize; auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize, - unsigned FuncGrowth) -> bool { + unsigned FuncGrowth, InstCostVisitor &V) -> bool { // No check required. if (ForceSpecialization) return true; @@ -902,6 +911,14 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, // Minimum codesize savings. if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) return false; + + // Lazily compute the Latency, to avoid unnecessarily computing BFI. + B += {0, V.getLatencyBonus()}; + + LLVM_DEBUG( + dbgs() << "FnSpecialization: Specialization bonus {Latency = " + << B.Latency << "}\n"); + // Minimum latency savings. if (B.Latency < MinLatencySavings * FuncSize / 100) return false; @@ -912,7 +929,7 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, }; // Discard unprofitable specialisations. - if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F])) + if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor)) continue; // Create a new specialisation entry. diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll index ff57569d12788..f8c8e33dfc233 100644 --- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll +++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll @@ -4,25 +4,25 @@ ; This test case is trying to validate that the postdomtree is preserved ; correctly by the ipsccp pass. A tricky bug was introduced in commit -; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order +; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order ; to setup a DomTreeUpdater (to update the PDT during transformation in order ; to preserve the analysis). But given that commit the PDT could end up being ; required and calculated via BlockFrequency analysis. So the problem was that ; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't -; cached at the begininng of IPSCCP, to indicate that no updates where needed +; cached at the beginning of IPSCCP, to indicate that no updates were needed ; for PDT. But then the PDT was calculated, given the input IR, and preserved ; using the non-updated state (as the DTU wasn't configured for updating the ; PDT). ; CHECK-NOT: ; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries. -; CHECK-NEXT: [1] <> {4294967295,4294967295} [0] -; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1] -; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2] -; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1] -; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1] -; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2] -; CHECK-NEXT: Roots: %for.cond34 %for.body +; CHECK-NEXT: [1] <> {4294967295,4294967295} [0] +; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1] +; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1] +; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2] +; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1] +; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2] +; CHECK-NEXT: Roots: %for.body %for.cond34 ; CHECK-NEXT: PostDominatorTree for function: bar ; CHECK-NOT: diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp index b0ff55489e176..9f2053948cb33 100644 --- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp +++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp @@ -43,6 +43,7 @@ class FunctionSpecializationTest : public testing::Test { FunctionAnalysisManager FAM; std::unique_ptr M; std::unique_ptr Solver; + SmallVector Instructions; FunctionSpecializationTest() { FAM.registerPass([&] { return TargetLibraryAnalysis(); }); @@ -97,21 +98,29 @@ class FunctionSpecializationTest : public testing::Test { GetAC); } - Bonus getInstCost(Instruction &I, bool SizeOnly = false) { + Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) { auto &TTI = FAM.getResult(*I.getFunction()); - auto &BFI = FAM.getResult(*I.getFunction()); Cost CodeSize = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - Cost Latency = - SizeOnly - ? 0 - : BFI.getBlockFreq(I.getParent()).getFrequency() / - BFI.getEntryFreq().getFrequency() * - TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency); + if (!SizeOnly) + Instructions.push_back(&I); - return {CodeSize, Latency}; + return CodeSize; + } + + Cost getLatency(Function *F) { + auto &TTI = FAM.getResult(*F); + auto &BFI = FAM.getResult(*F); + + Cost Latency = 0; + for (const Instruction *I : Instructions) + Latency += BFI.getBlockFreq(I->getParent()).getFrequency() / + BFI.getEntryFreq().getFrequency() * + TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); + + return Latency; } }; @@ -171,25 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) { Instruction &BrLoop = BB2.back(); // mul - Bonus Ref = getInstCost(Mul); - Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One); + Cost Ref = getInstCodeSize(Mul); + Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // and + or + add - Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add); - Test = Visitor.getSpecializationBonus(F->getArg(1), One); + Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add); + Test = Visitor.getCodeSizeBonus(F->getArg(1), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // switch + sdiv + br + br - Ref = getInstCost(Switch) + - getInstCost(Sdiv, /*SizeOnly =*/ true) + - getInstCost(BrBB2, /*SizeOnly =*/ true) + - getInstCost(BrLoop, /*SizeOnly =*/ true); - Test = Visitor.getSpecializationBonus(F->getArg(2), One); + Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) + + getInstCodeSize(BrBB2, /*SizeOnly =*/true) + + getInstCodeSize(BrLoop, /*SizeOnly =*/true); + Test = Visitor.getCodeSizeBonus(F->getArg(2), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); + + // Latency. + Ref = getLatency(F); + Test = Visitor.getLatencyBonus(); + EXPECT_EQ(Test, Ref); + EXPECT_TRUE(Test > 0); } TEST_F(FunctionSpecializationTest, BranchInst) { @@ -238,27 +252,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) { Instruction &BrLoop = BB2.front(); // mul - Bonus Ref = getInstCost(Mul); - Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One); + Cost Ref = getInstCodeSize(Mul); + Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // add - Ref = getInstCost(Add); - Test = Visitor.getSpecializationBonus(F->getArg(1), One); + Ref = getInstCodeSize(Add); + Test = Visitor.getCodeSizeBonus(F->getArg(1), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // branch + sub + br + sdiv + br - Ref = getInstCost(Branch) + - getInstCost(Sub, /*SizeOnly =*/ true) + - getInstCost(BrBB1BB2) + - getInstCost(Sdiv, /*SizeOnly =*/ true) + - getInstCost(BrBB2, /*SizeOnly =*/ true) + - getInstCost(BrLoop, /*SizeOnly =*/ true); - Test = Visitor.getSpecializationBonus(F->getArg(2), False); + Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) + + getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) + + getInstCodeSize(BrBB2, /*SizeOnly =*/true) + + getInstCodeSize(BrLoop, /*SizeOnly =*/true); + Test = Visitor.getCodeSizeBonus(F->getArg(2), False); + EXPECT_EQ(Test, Ref); + EXPECT_TRUE(Test > 0); + + // Latency. + Ref = getLatency(F); + Test = Visitor.getLatencyBonus(); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); } TEST_F(FunctionSpecializationTest, SelectInst) { @@ -279,14 +297,22 @@ TEST_F(FunctionSpecializationTest, SelectInst) { Constant *False = ConstantInt::getFalse(M.getContext()); Instruction &Select = *F->front().begin(); - Bonus Ref = getInstCost(Select); - Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), False); - EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0); - Test = Visitor.getSpecializationBonus(F->getArg(1), One); - EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0); - Test = Visitor.getSpecializationBonus(F->getArg(2), Zero); - EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + Cost RefCodeSize = getInstCodeSize(Select); + Cost RefLatency = getLatency(F); + + Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False); + EXPECT_TRUE(TestCodeSize == 0); + TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One); + EXPECT_TRUE(TestCodeSize == 0); + Cost TestLatency = Visitor.getLatencyBonus(); + EXPECT_TRUE(TestLatency == 0); + + TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero); + EXPECT_EQ(TestCodeSize, RefCodeSize); + EXPECT_TRUE(TestCodeSize > 0); + TestLatency = Visitor.getLatencyBonus(); + EXPECT_EQ(TestLatency, RefLatency); + EXPECT_TRUE(TestLatency > 0); } TEST_F(FunctionSpecializationTest, Misc) { @@ -332,26 +358,32 @@ TEST_F(FunctionSpecializationTest, Misc) { Instruction &Smax = *BlockIter++; // icmp + zext - Bonus Ref = getInstCost(Icmp) + getInstCost(Zext); - Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One); + Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext); + Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // select - Ref = getInstCost(Select); - Test = Visitor.getSpecializationBonus(F->getArg(1), True); + Ref = getInstCodeSize(Select); + Test = Visitor.getCodeSizeBonus(F->getArg(1), True); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); // gep + load + freeze + smax - Ref = getInstCost(Gep) + getInstCost(Load) + getInstCost(Freeze) + - getInstCost(Smax); - Test = Visitor.getSpecializationBonus(F->getArg(2), GV); + Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) + + getInstCodeSize(Smax); + Test = Visitor.getCodeSizeBonus(F->getArg(2), GV); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); - Test = Visitor.getSpecializationBonus(F->getArg(3), Undef); - EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0); + Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef); + EXPECT_TRUE(Test == 0); + + // Latency. + Ref = getLatency(F); + Test = Visitor.getLatencyBonus(); + EXPECT_EQ(Test, Ref); + EXPECT_TRUE(Test > 0); } TEST_F(FunctionSpecializationTest, PhiNode) { @@ -401,25 +433,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) { Instruction &Icmp = *++BB.begin(); Instruction &Branch = BB.back(); - Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One); - EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0); + Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); + EXPECT_TRUE(Test == 0); - Test = Visitor.getSpecializationBonus(F->getArg(1), One); - EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0); + Test = Visitor.getCodeSizeBonus(F->getArg(1), One); + EXPECT_TRUE(Test == 0); + + Test = Visitor.getLatencyBonus(); + EXPECT_TRUE(Test == 0); // switch + phi + br - Bonus Ref = getInstCost(Switch) + - getInstCost(PhiCase2, /*SizeOnly =*/ true) + - getInstCost(BrBB, /*SizeOnly =*/ true); - Test = Visitor.getSpecializationBonus(F->getArg(2), One); + Cost Ref = getInstCodeSize(Switch) + + getInstCodeSize(PhiCase2, /*SizeOnly =*/true) + + getInstCodeSize(BrBB, /*SizeOnly =*/true); + Test = Visitor.getCodeSizeBonus(F->getArg(2), One); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0 && Test > 0); // phi + phi + add + icmp + branch - Ref = getInstCost(PhiBB) + getInstCost(PhiLoop) + getInstCost(Add) + - getInstCost(Icmp) + getInstCost(Branch); - Test = Visitor.getBonusFromPendingPHIs(); + Ref = getInstCodeSize(PhiBB) + getInstCodeSize(PhiLoop) + + getInstCodeSize(Add) + getInstCodeSize(Icmp) + getInstCodeSize(Branch); + Test = Visitor.getCodeSizeBonusFromPendingPHIs(); + EXPECT_EQ(Test, Ref); + EXPECT_TRUE(Test > 0); + + // Latency. + Ref = getLatency(F); + Test = Visitor.getLatencyBonus(); EXPECT_EQ(Test, Ref); - EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0); + EXPECT_TRUE(Test > 0); } From fc3aaa44dbcc0e5d78e55816b51dc7437aae85d0 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 21 Oct 2024 15:16:41 +0000 Subject: [PATCH 2/4] Address review comments --- .../Transforms/IPO/FunctionSpecialization.h | 42 +------ .../Transforms/IPO/FunctionSpecialization.cpp | 62 +++++++--- .../IPO/FunctionSpecializationTest.cpp | 117 +++++++++--------- 3 files changed, 109 insertions(+), 112 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index a50eafe48293a..5920dde9d77df 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -140,38 +140,6 @@ struct Spec { : F(F), Sig(S), Score(Score) {} }; -struct Bonus { - unsigned CodeSize = 0; - unsigned Latency = 0; - - Bonus() = default; - - Bonus(Cost CodeSize, Cost Latency) { - int64_t Sz = *CodeSize.getValue(); - int64_t Ltc = *Latency.getValue(); - - assert(Sz >= 0 && Ltc >= 0 && "CodeSize and Latency cannot be negative"); - // It is safe to down cast since we know the arguments - // cannot be negative and Cost is of type int64_t. - this->CodeSize = static_cast(Sz); - this->Latency = static_cast(Ltc); - } - - Bonus &operator+=(const Bonus RHS) { - CodeSize += RHS.CodeSize; - Latency += RHS.Latency; - return *this; - } - - Bonus operator+(const Bonus RHS) const { - return Bonus(CodeSize + RHS.CodeSize, Latency + RHS.Latency); - } - - bool operator==(const Bonus RHS) const { - return CodeSize == RHS.CodeSize && Latency == RHS.Latency; - } -}; - class InstCostVisitor : public InstVisitor { std::function GetBFI; Function *F; @@ -202,11 +170,11 @@ class InstCostVisitor : public InstVisitor { return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB); } - Cost getCodeSizeBonus(Argument *A, Constant *C); + Cost getCodeSizeSavingsForArg(Argument *A, Constant *C); - Cost getCodeSizeBonusFromPendingPHIs(); + Cost getCodeSizeSavingsFromPendingPHIs(); - Cost getLatencyBonus(); + Cost getLatencySavingsForKnownConstants(); private: friend class InstVisitor; @@ -214,8 +182,8 @@ class InstCostVisitor : public InstVisitor { static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ, DenseSet &DeadBlocks); - Cost getUserCodeSizeBonus(Instruction *User, Value *Use = nullptr, - Constant *C = nullptr); + Cost getCodeSizeSavingsForUser(Instruction *User, Value *Use = nullptr, + Constant *C = nullptr); Cost estimateBasicBlocks(SmallVectorImpl &WorkList); Cost estimateSwitchInst(SwitchInst &I); diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 38c10c29cf4e3..04a62837ff962 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -154,33 +154,45 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) { return KnownConstants.lookup(V); } -Cost InstCostVisitor::getCodeSizeBonusFromPendingPHIs() { +Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() { Cost CodeSize; while (!PendingPHIs.empty()) { Instruction *Phi = PendingPHIs.pop_back_val(); // The pending PHIs could have been proven dead by now. if (isBlockExecutable(Phi->getParent())) - CodeSize += getUserCodeSizeBonus(Phi); + CodeSize += getCodeSizeSavingsForUser(Phi); } return CodeSize; } /// Compute the codesize savings for replacing argument \p A with constant \p C. -Cost InstCostVisitor::getCodeSizeBonus(Argument *A, Constant *C) { +Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) { LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: " << C->getNameOrAsOperand() << "\n"); Cost CodeSize; for (auto *U : A->users()) if (auto *UI = dyn_cast(U)) if (isBlockExecutable(UI->getParent())) - CodeSize += getUserCodeSizeBonus(UI, A, C); + CodeSize += getCodeSizeSavingsForUser(UI, A, C); LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = " << CodeSize << "} for argument " << *A << "\n"); return CodeSize; } -Cost InstCostVisitor::getLatencyBonus() { +/// Compute the latency savings from replacing all arguments with constants for +/// a specialization candidate. As this function computes the latency savings +/// for all Instructions in KnownConstants at once, it should be called only +/// after every instruction has been visited, i.e. after: +/// +/// * getCodeSizeBonus has been run for every constant argument of a +/// specialization candidate +/// +/// * getCodeSizeBonusFromPendingPHIs has been run +/// +/// to ensure that the latency savings are calculated for all Instructions we +/// have visited and found to be constant. +Cost InstCostVisitor::getLatencySavingsForKnownConstants() { auto &BFI = GetBFI(*F); Cost Latency = 0; @@ -198,8 +210,8 @@ Cost InstCostVisitor::getLatencyBonus() { return Latency; } -Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use, - Constant *C) { +Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use, + Constant *C) { // We have already propagated a constant for this user. if (KnownConstants.contains(User)) return 0; @@ -232,7 +244,7 @@ Cost InstCostVisitor::getUserCodeSizeBonus(Instruction *User, Value *Use, for (auto *U : User->users()) if (auto *UI = dyn_cast(U)) if (UI != User && isBlockExecutable(UI->getParent())) - CodeSize += getUserCodeSizeBonus(UI, User, C); + CodeSize += getCodeSizeSavingsForUser(UI, User, C); return CodeSize; } @@ -819,6 +831,15 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) { return Clone; } +static unsigned getCostValue(const Cost &C) { + int64_t Value = *C.getValue(); + + assert(Value >= 0 && "CodeSize and Latency cannot be negative"); + // It is safe to down cast since we know the arguments cannot be negative and + // Cost is of type int64_t. + return static_cast(Value); +} + bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, SmallVectorImpl &AllSpecs, SpecMap &SM) { @@ -889,18 +910,20 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, unsigned Score = 0; InstCostVisitor Visitor = getInstCostVisitorFor(F); for (ArgInfo &A : S.Args) { - CodeSize += Visitor.getCodeSizeBonus(A.Formal, A.Actual); + CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual); Score += getInliningBonus(A.Formal, A.Actual); } - CodeSize += Visitor.getCodeSizeBonusFromPendingPHIs(); + CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs(); LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = " << CodeSize << ", Inlining = " << Score << "}\n"); - Bonus B = {CodeSize, 0}; - FunctionGrowth[F] += FuncSize - B.CodeSize; + unsigned LatencySavings = 0; + unsigned CodeSizeSavings = getCostValue(CodeSize); + FunctionGrowth[F] += FuncSize - CodeSizeSavings; - auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize, + auto IsProfitable = [](unsigned CodeSizeSavings, unsigned &LatencySavings, + unsigned Score, unsigned FuncSize, unsigned FuncGrowth, InstCostVisitor &V) -> bool { // No check required. if (ForceSpecialization) @@ -909,18 +932,18 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, if (Score > MinInliningBonus * FuncSize / 100) return true; // Minimum codesize savings. - if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) + if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100) return false; // Lazily compute the Latency, to avoid unnecessarily computing BFI. - B += {0, V.getLatencyBonus()}; + LatencySavings = getCostValue(V.getLatencySavingsForKnownConstants()); LLVM_DEBUG( dbgs() << "FnSpecialization: Specialization bonus {Latency = " - << B.Latency << "}\n"); + << LatencySavings << "}\n"); // Minimum latency savings. - if (B.Latency < MinLatencySavings * FuncSize / 100) + if (LatencySavings < MinLatencySavings * FuncSize / 100) return false; // Maximum codesize growth. if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) @@ -929,11 +952,12 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, }; // Discard unprofitable specialisations. - if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F], Visitor)) + if (!IsProfitable(CodeSizeSavings, LatencySavings, Score, FuncSize, + FunctionGrowth[F], Visitor)) continue; // Create a new specialisation entry. - Score += std::max(B.CodeSize, B.Latency); + Score += std::max(CodeSizeSavings, LatencySavings); auto &Spec = AllSpecs.emplace_back(F, S, Score); if (CS.getFunction() != F) Spec.CallSites.push_back(&CS); diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp index 9f2053948cb33..c8fd366bfac65 100644 --- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp +++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp @@ -43,7 +43,7 @@ class FunctionSpecializationTest : public testing::Test { FunctionAnalysisManager FAM; std::unique_ptr M; std::unique_ptr Solver; - SmallVector Instructions; + SmallVector KnownConstants; FunctionSpecializationTest() { FAM.registerPass([&] { return TargetLibraryAnalysis(); }); @@ -98,24 +98,24 @@ class FunctionSpecializationTest : public testing::Test { GetAC); } - Cost getInstCodeSize(Instruction &I, bool SizeOnly = false) { + Cost getCodeSizeSavings(Instruction &I, bool HasLatencySavings = true) { auto &TTI = FAM.getResult(*I.getFunction()); Cost CodeSize = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - if (!SizeOnly) - Instructions.push_back(&I); + if (HasLatencySavings) + KnownConstants.push_back(&I); return CodeSize; } - Cost getLatency(Function *F) { + Cost getLatencySavings(Function *F) { auto &TTI = FAM.getResult(*F); auto &BFI = FAM.getResult(*F); Cost Latency = 0; - for (const Instruction *I : Instructions) + for (const Instruction *I : KnownConstants) Latency += BFI.getBlockFreq(I->getParent()).getFrequency() / BFI.getEntryFreq().getFrequency() * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); @@ -180,28 +180,30 @@ TEST_F(FunctionSpecializationTest, SwitchInst) { Instruction &BrLoop = BB2.back(); // mul - Cost Ref = getInstCodeSize(Mul); - Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); + Cost Ref = getCodeSizeSavings(Mul); + Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // and + or + add - Ref = getInstCodeSize(And) + getInstCodeSize(Or) + getInstCodeSize(Add); - Test = Visitor.getCodeSizeBonus(F->getArg(1), One); + Ref = getCodeSizeSavings(And) + getCodeSizeSavings(Or) + + getCodeSizeSavings(Add); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // switch + sdiv + br + br - Ref = getInstCodeSize(Switch) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) + - getInstCodeSize(BrBB2, /*SizeOnly =*/true) + - getInstCodeSize(BrLoop, /*SizeOnly =*/true); - Test = Visitor.getCodeSizeBonus(F->getArg(2), One); + Ref = getCodeSizeSavings(Switch) + + getCodeSizeSavings(Sdiv, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrBB2, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrLoop, /*HasLatencySavings=*/false); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // Latency. - Ref = getLatency(F); - Test = Visitor.getLatencyBonus(); + Ref = getLatencySavings(F); + Test = Visitor.getLatencySavingsForKnownConstants(); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); } @@ -252,29 +254,31 @@ TEST_F(FunctionSpecializationTest, BranchInst) { Instruction &BrLoop = BB2.front(); // mul - Cost Ref = getInstCodeSize(Mul); - Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); + Cost Ref = getCodeSizeSavings(Mul); + Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // add - Ref = getInstCodeSize(Add); - Test = Visitor.getCodeSizeBonus(F->getArg(1), One); + Ref = getCodeSizeSavings(Add); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // branch + sub + br + sdiv + br - Ref = getInstCodeSize(Branch) + getInstCodeSize(Sub, /*SizeOnly =*/true) + - getInstCodeSize(BrBB1BB2) + getInstCodeSize(Sdiv, /*SizeOnly =*/true) + - getInstCodeSize(BrBB2, /*SizeOnly =*/true) + - getInstCodeSize(BrLoop, /*SizeOnly =*/true); - Test = Visitor.getCodeSizeBonus(F->getArg(2), False); + Ref = getCodeSizeSavings(Branch) + + getCodeSizeSavings(Sub, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrBB1BB2) + + getCodeSizeSavings(Sdiv, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrBB2, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrLoop, /*HasLatencySavings=*/false); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), False); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // Latency. - Ref = getLatency(F); - Test = Visitor.getLatencyBonus(); + Ref = getLatencySavings(F); + Test = Visitor.getLatencySavingsForKnownConstants(); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); } @@ -297,20 +301,20 @@ TEST_F(FunctionSpecializationTest, SelectInst) { Constant *False = ConstantInt::getFalse(M.getContext()); Instruction &Select = *F->front().begin(); - Cost RefCodeSize = getInstCodeSize(Select); - Cost RefLatency = getLatency(F); + Cost RefCodeSize = getCodeSizeSavings(Select); + Cost RefLatency = getLatencySavings(F); - Cost TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(0), False); + Cost TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(0), False); EXPECT_TRUE(TestCodeSize == 0); - TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(1), One); + TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One); EXPECT_TRUE(TestCodeSize == 0); - Cost TestLatency = Visitor.getLatencyBonus(); + Cost TestLatency = Visitor.getLatencySavingsForKnownConstants(); EXPECT_TRUE(TestLatency == 0); - TestCodeSize = Visitor.getCodeSizeBonus(F->getArg(2), Zero); + TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(2), Zero); EXPECT_EQ(TestCodeSize, RefCodeSize); EXPECT_TRUE(TestCodeSize > 0); - TestLatency = Visitor.getLatencyBonus(); + TestLatency = Visitor.getLatencySavingsForKnownConstants(); EXPECT_EQ(TestLatency, RefLatency); EXPECT_TRUE(TestLatency > 0); } @@ -358,30 +362,30 @@ TEST_F(FunctionSpecializationTest, Misc) { Instruction &Smax = *BlockIter++; // icmp + zext - Cost Ref = getInstCodeSize(Icmp) + getInstCodeSize(Zext); - Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); + Cost Ref = getCodeSizeSavings(Icmp) + getCodeSizeSavings(Zext); + Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // select - Ref = getInstCodeSize(Select); - Test = Visitor.getCodeSizeBonus(F->getArg(1), True); + Ref = getCodeSizeSavings(Select); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), True); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // gep + load + freeze + smax - Ref = getInstCodeSize(Gep) + getInstCodeSize(Load) + getInstCodeSize(Freeze) + - getInstCodeSize(Smax); - Test = Visitor.getCodeSizeBonus(F->getArg(2), GV); + Ref = getCodeSizeSavings(Gep) + getCodeSizeSavings(Load) + + getCodeSizeSavings(Freeze) + getCodeSizeSavings(Smax); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), GV); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); - Test = Visitor.getCodeSizeBonus(F->getArg(3), Undef); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(3), Undef); EXPECT_TRUE(Test == 0); // Latency. - Ref = getLatency(F); - Test = Visitor.getLatencyBonus(); + Ref = getLatencySavings(F); + Test = Visitor.getLatencySavingsForKnownConstants(); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); } @@ -433,33 +437,34 @@ TEST_F(FunctionSpecializationTest, PhiNode) { Instruction &Icmp = *++BB.begin(); Instruction &Branch = BB.back(); - Cost Test = Visitor.getCodeSizeBonus(F->getArg(0), One); + Cost Test = Visitor.getCodeSizeSavingsForArg(F->getArg(0), One); EXPECT_TRUE(Test == 0); - Test = Visitor.getCodeSizeBonus(F->getArg(1), One); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(1), One); EXPECT_TRUE(Test == 0); - Test = Visitor.getLatencyBonus(); + Test = Visitor.getLatencySavingsForKnownConstants(); EXPECT_TRUE(Test == 0); // switch + phi + br - Cost Ref = getInstCodeSize(Switch) + - getInstCodeSize(PhiCase2, /*SizeOnly =*/true) + - getInstCodeSize(BrBB, /*SizeOnly =*/true); - Test = Visitor.getCodeSizeBonus(F->getArg(2), One); + Cost Ref = getCodeSizeSavings(Switch) + + getCodeSizeSavings(PhiCase2, /*HasLatencySavings=*/false) + + getCodeSizeSavings(BrBB, /*HasLatencySavings=*/false); + Test = Visitor.getCodeSizeSavingsForArg(F->getArg(2), One); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0 && Test > 0); // phi + phi + add + icmp + branch - Ref = getInstCodeSize(PhiBB) + getInstCodeSize(PhiLoop) + - getInstCodeSize(Add) + getInstCodeSize(Icmp) + getInstCodeSize(Branch); - Test = Visitor.getCodeSizeBonusFromPendingPHIs(); + Ref = getCodeSizeSavings(PhiBB) + getCodeSizeSavings(PhiLoop) + + getCodeSizeSavings(Add) + getCodeSizeSavings(Icmp) + + getCodeSizeSavings(Branch); + Test = Visitor.getCodeSizeSavingsFromPendingPHIs(); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); // Latency. - Ref = getLatency(F); - Test = Visitor.getLatencyBonus(); + Ref = getLatencySavings(F); + Test = Visitor.getLatencySavingsForKnownConstants(); EXPECT_EQ(Test, Ref); EXPECT_TRUE(Test > 0); } From b9336f26bc021d91a1fc1d5a2495b11124ab67f5 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 21 Oct 2024 21:49:53 +0000 Subject: [PATCH 3/4] Address review comments 2 - Fix outdated function names in comments. - Sink debug messages about profitability into the IsProfitable lambda, and also add percentage of Function size while we're here. - Move other profitability related statements into IsProfitable. - Add a TODO to improve the accumulation of codesize increases (FunctionGrowth) --- .../Transforms/IPO/FunctionSpecialization.cpp | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 04a62837ff962..35a75515cca22 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -185,10 +185,10 @@ Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) { /// for all Instructions in KnownConstants at once, it should be called only /// after every instruction has been visited, i.e. after: /// -/// * getCodeSizeBonus has been run for every constant argument of a +/// * getCodeSizeSavingsForArg has been run for every constant argument of a /// specialization candidate /// -/// * getCodeSizeBonusFromPendingPHIs has been run +/// * getCodeSizeSavingsFromPendingPHIs has been run /// /// to ensure that the latency savings are calculated for all Instructions we /// have visited and found to be constant. @@ -831,6 +831,9 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) { return Clone; } +/// Get the unsigned Value of given Cost object. Assumes the Cost is always +/// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and +/// always Valid. static unsigned getCostValue(const Cost &C) { int64_t Value = *C.getValue(); @@ -915,49 +918,58 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize, } CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs(); - LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = " - << CodeSize << ", Inlining = " << Score << "}\n"); - - unsigned LatencySavings = 0; - unsigned CodeSizeSavings = getCostValue(CodeSize); - FunctionGrowth[F] += FuncSize - CodeSizeSavings; - - auto IsProfitable = [](unsigned CodeSizeSavings, unsigned &LatencySavings, - unsigned Score, unsigned FuncSize, - unsigned FuncGrowth, InstCostVisitor &V) -> bool { + auto IsProfitable = [&]() -> bool { // No check required. if (ForceSpecialization) return true; + + unsigned CodeSizeSavings = getCostValue(CodeSize); + // TODO: We should only accumulate codesize increase of specializations + // that are actually created. + FunctionGrowth[F] += FuncSize - CodeSizeSavings; + + LLVM_DEBUG( + dbgs() << "FnSpecialization: Specialization bonus {Inlining = " + << Score << " (" << (Score * 100 / FuncSize) << "%)}\n"); + // Minimum inlining bonus. if (Score > MinInliningBonus * FuncSize / 100) return true; + + LLVM_DEBUG( + dbgs() << "FnSpecialization: Specialization bonus {CodeSize = " + << CodeSizeSavings << " (" + << (CodeSizeSavings * 100 / FuncSize) << "%)}\n"); + // Minimum codesize savings. if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100) return false; // Lazily compute the Latency, to avoid unnecessarily computing BFI. - LatencySavings = getCostValue(V.getLatencySavingsForKnownConstants()); + unsigned LatencySavings = + getCostValue(Visitor.getLatencySavingsForKnownConstants()); LLVM_DEBUG( dbgs() << "FnSpecialization: Specialization bonus {Latency = " - << LatencySavings << "}\n"); + << LatencySavings << " (" + << (LatencySavings * 100 / FuncSize) << "%)}\n"); // Minimum latency savings. if (LatencySavings < MinLatencySavings * FuncSize / 100) return false; // Maximum codesize growth. - if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) + if (FunctionGrowth[F] / FuncSize > MaxCodeSizeGrowth) return false; + + Score += std::max(CodeSizeSavings, LatencySavings); return true; }; // Discard unprofitable specialisations. - if (!IsProfitable(CodeSizeSavings, LatencySavings, Score, FuncSize, - FunctionGrowth[F], Visitor)) + if (!IsProfitable()) continue; // Create a new specialisation entry. - Score += std::max(CodeSizeSavings, LatencySavings); auto &Spec = AllSpecs.emplace_back(F, S, Score); if (CS.getFunction() != F) Spec.CallSites.push_back(&CS); From 0a5d644a0c877d019a152a5db6d79760b8f117d0 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Tue, 22 Oct 2024 14:36:55 +0000 Subject: [PATCH 4/4] Address review 3 - Add debug statement to getLatencySavingsForKnownConstants --- llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 35a75515cca22..20249a20a37e4 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -194,7 +194,7 @@ Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) { /// have visited and found to be constant. Cost InstCostVisitor::getLatencySavingsForKnownConstants() { auto &BFI = GetBFI(*F); - Cost Latency = 0; + Cost TotalLatency = 0; for (auto Pair : KnownConstants) { Instruction *I = dyn_cast(Pair.first); @@ -203,11 +203,17 @@ Cost InstCostVisitor::getLatencySavingsForKnownConstants() { uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() / BFI.getEntryFreq().getFrequency(); - Latency += + + Cost Latency = Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); + + LLVM_DEBUG(dbgs() << "FnSpecialization: {Latency = " << Latency + << "} for instruction " << *I << "\n"); + + TotalLatency += Latency; } - return Latency; + return TotalLatency; } Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,