Skip to content

[FuncSpec] Only compute Latency bonus when necessary #113159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 13 additions & 42 deletions llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,41 +140,10 @@ struct Spec {
: F(F), Sig(S), Score(Score) {}
};

/// A pair of unsigned bonus counters, one for code size and one for latency,
/// that can be accumulated and compared.
struct Bonus {
  unsigned CodeSize = 0;
  unsigned Latency = 0;

  /// Both counters start at zero.
  Bonus() = default;

  /// Build a Bonus from two Cost objects. Each Cost must hold a value, and
  /// that value must be non-negative (checked by the assertion below).
  Bonus(Cost CodeSize, Cost Latency) {
    const int64_t RawCodeSize = *CodeSize.getValue();
    const int64_t RawLatency = *Latency.getValue();

    assert(RawCodeSize >= 0 && RawLatency >= 0 &&
           "CodeSize and Latency cannot be negative");
    // Negative values were rejected above; narrow to the unsigned members.
    this->CodeSize = static_cast<unsigned>(RawCodeSize);
    this->Latency = static_cast<unsigned>(RawLatency);
  }

  /// Add the counters of \p RHS onto this Bonus, component by component.
  Bonus &operator+=(const Bonus RHS) {
    CodeSize += RHS.CodeSize;
    Latency += RHS.Latency;
    return *this;
  }

  /// Return the component-wise sum of this Bonus and \p RHS.
  Bonus operator+(const Bonus RHS) const {
    Bonus Sum = *this;
    Sum += RHS;
    return Sum;
  }

  /// Two Bonus objects are equal when both counters match.
  bool operator==(const Bonus RHS) const {
    return !(CodeSize != RHS.CodeSize || Latency != RHS.Latency);
  }
};

class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
std::function<BlockFrequencyInfo &(Function &)> GetBFI;
Function *F;
const DataLayout &DL;
BlockFrequencyInfo &BFI;
TargetTransformInfo &TTI;
SCCPSolver &Solver;

Expand All @@ -192,26 +161,29 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
ConstMap::iterator LastVisited;

public:
InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI,
TargetTransformInfo &TTI, SCCPSolver &Solver)
: DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {}
InstCostVisitor(std::function<BlockFrequencyInfo &(Function &)> GetBFI,
Function *F, const DataLayout &DL, TargetTransformInfo &TTI,
SCCPSolver &Solver)
: GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {}

bool isBlockExecutable(BasicBlock *BB) {
return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
}

Bonus getSpecializationBonus(Argument *A, Constant *C);
Cost getCodeSizeSavingsForArg(Argument *A, Constant *C);

Cost getCodeSizeSavingsFromPendingPHIs();

Bonus getBonusFromPendingPHIs();
Cost getLatencySavingsForKnownConstants();

private:
friend class InstVisitor<InstCostVisitor, Constant *>;

static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
DenseSet<BasicBlock *> &DeadBlocks);

Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
Constant *C = nullptr);
Cost getCodeSizeSavingsForUser(Instruction *User, Value *Use = nullptr,
Constant *C = nullptr);

Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
Cost estimateSwitchInst(SwitchInst &I);
Expand Down Expand Up @@ -283,9 +255,8 @@ class FunctionSpecializer {
bool run();

InstCostVisitor getInstCostVisitorFor(Function *F) {
auto &BFI = GetBFI(*F);
auto &TTI = GetTTI(*F);
return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver);
}

private:
Expand Down
145 changes: 102 additions & 43 deletions llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
Cost InstCostVisitor::estimateBasicBlocks(
SmallVectorImpl<BasicBlock *> &WorkList) {
Cost CodeSize = 0;
// Accumulate the instruction cost of each basic block weighted by frequency.
// Accumulate the codesize savings of each basic block.
while (!WorkList.empty()) {
BasicBlock *BB = WorkList.pop_back_val();

Expand Down Expand Up @@ -154,37 +154,73 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
return KnownConstants.lookup(V);
}

Bonus InstCostVisitor::getBonusFromPendingPHIs() {
Bonus B;
Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
Cost CodeSize;
while (!PendingPHIs.empty()) {
Instruction *Phi = PendingPHIs.pop_back_val();
// The pending PHIs could have been proven dead by now.
if (isBlockExecutable(Phi->getParent()))
B += getUserBonus(Phi);
CodeSize += getCodeSizeSavingsForUser(Phi);
}
return B;
return CodeSize;
}

/// Compute a bonus for replacing argument \p A with constant \p C.
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
/// Compute the codesize savings for replacing argument \p A with constant \p C.
Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
<< C->getNameOrAsOperand() << "\n");
Bonus B;
Cost CodeSize;
for (auto *U : A->users())
if (auto *UI = dyn_cast<Instruction>(U))
if (isBlockExecutable(UI->getParent()))
B += getUserBonus(UI, A, C);
CodeSize += getCodeSizeSavingsForUser(UI, A, C);

LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
<< B.CodeSize << ", Latency = " << B.Latency
<< "} for argument " << *A << "\n");
return B;
<< CodeSize << "} for argument " << *A << "\n");
return CodeSize;
}

Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
/// Sum the frequency-weighted latency cost of every Instruction recorded in
/// KnownConstants, giving the latency savings of a specialization candidate
/// whose arguments are all replaced with constants.
///
/// The whole of KnownConstants is processed in a single pass, so this must
/// only be called once every instruction has been visited, i.e. after:
///
/// * getCodeSizeSavingsForArg has been run for every constant argument of the
///   specialization candidate, and
///
/// * getCodeSizeSavingsFromPendingPHIs has been run.
///
/// Calling it earlier would leave out Instructions that have not yet been
/// visited and found to be constant.
Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
  auto &BFI = GetBFI(*F);
  Cost TotalLatency = 0;

  for (auto Entry : KnownConstants) {
    // Only keys that are Instructions contribute to the latency savings.
    if (auto *Inst = dyn_cast<Instruction>(Entry.first)) {
      // Integer ratio of the parent block's frequency to the entry frequency.
      const uint64_t BlockFreq =
          BFI.getBlockFreq(Inst->getParent()).getFrequency();
      const uint64_t EntryFreq = BFI.getEntryFreq().getFrequency();
      uint64_t Weight = BlockFreq / EntryFreq;

      Cost Latency =
          Weight *
          TTI.getInstructionCost(Inst, TargetTransformInfo::TCK_Latency);

      LLVM_DEBUG(dbgs() << "FnSpecialization: {Latency = " << Latency
                        << "} for instruction " << *Inst << "\n");

      TotalLatency += Latency;
    }
  }

  return TotalLatency;
}

Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
Constant *C) {
// We have already propagated a constant for this user.
if (KnownConstants.contains(User))
return {0, 0};
return 0;

// Cache the iterator before visiting.
LastVisited = Use ? KnownConstants.insert({Use, C}).first
Expand All @@ -198,7 +234,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
} else {
C = visit(*User);
if (!C)
return {0, 0};
return 0;
}

// Even though it doesn't make sense to bind switch and branch instructions
Expand All @@ -208,23 +244,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)

CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
BFI.getEntryFreq().getFrequency();

Cost Latency = Weight *
TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);

LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
<< ", Latency = " << Latency << "} for user "
<< *User << "\n");
<< "} for user " << *User << "\n");

Bonus B(CodeSize, Latency);
for (auto *U : User->users())
if (auto *UI = dyn_cast<Instruction>(U))
if (UI != User && isBlockExecutable(UI->getParent()))
B += getUserBonus(UI, User, C);
CodeSize += getCodeSizeSavingsForUser(UI, User, C);

return B;
return CodeSize;
}

Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
Expand Down Expand Up @@ -809,6 +837,18 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
return Clone;
}

/// Get the unsigned Value of given Cost object. Assumes the Cost is always
/// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and
/// always Valid.
static unsigned getCostValue(const Cost &C) {
int64_t Value = *C.getValue();

assert(Value >= 0 && "CodeSize and Latency cannot be negative");
// It is safe to down cast since we know the arguments cannot be negative and
// Cost is of type int64_t.
return static_cast<unsigned>(Value);
}

bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
SmallVectorImpl<Spec> &AllSpecs,
SpecMap &SM) {
Expand Down Expand Up @@ -875,48 +915,67 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
AllSpecs[Index].CallSites.push_back(&CS);
} else {
// Calculate the specialisation gain.
Bonus B;
Cost CodeSize;
unsigned Score = 0;
InstCostVisitor Visitor = getInstCostVisitorFor(F);
for (ArgInfo &A : S.Args) {
B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
Score += getInliningBonus(A.Formal, A.Actual);
}
B += Visitor.getBonusFromPendingPHIs();
CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();


LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
<< B.CodeSize << ", Latency = " << B.Latency
<< ", Inlining = " << Score << "}\n");

FunctionGrowth[F] += FuncSize - B.CodeSize;

auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
unsigned FuncGrowth) -> bool {
auto IsProfitable = [&]() -> bool {
// No check required.
if (ForceSpecialization)
return true;

unsigned CodeSizeSavings = getCostValue(CodeSize);
// TODO: We should only accumulate codesize increase of specializations
// that are actually created.
FunctionGrowth[F] += FuncSize - CodeSizeSavings;

LLVM_DEBUG(
dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
<< Score << " (" << (Score * 100 / FuncSize) << "%)}\n");

// Minimum inlining bonus.
if (Score > MinInliningBonus * FuncSize / 100)
return true;

LLVM_DEBUG(
dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
<< CodeSizeSavings << " ("
<< (CodeSizeSavings * 100 / FuncSize) << "%)}\n");

// Minimum codesize savings.
if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
return false;

// Lazily compute the Latency, to avoid unnecessarily computing BFI.
unsigned LatencySavings =
getCostValue(Visitor.getLatencySavingsForKnownConstants());

LLVM_DEBUG(
dbgs() << "FnSpecialization: Specialization bonus {Latency = "
<< LatencySavings << " ("
<< (LatencySavings * 100 / FuncSize) << "%)}\n");

// Minimum latency savings.
if (B.Latency < MinLatencySavings * FuncSize / 100)
if (LatencySavings < MinLatencySavings * FuncSize / 100)
return false;
// Maximum codesize growth.
if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
if (FunctionGrowth[F] / FuncSize > MaxCodeSizeGrowth)
return false;

Score += std::max(CodeSizeSavings, LatencySavings);
return true;
};

// Discard unprofitable specialisations.
if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
if (!IsProfitable())
continue;

// Create a new specialisation entry.
Score += std::max(B.CodeSize, B.Latency);
auto &Spec = AllSpecs.emplace_back(F, S, Score);
if (CS.getFunction() != F)
Spec.CallSites.push_back(&CS);
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@

; This test case is trying to validate that the postdomtree is preserved
; correctly by the ipsccp pass. A tricky bug was introduced in commit
; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order
; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order
; to set up a DomTreeUpdater (to update the PDT during transformation in order
; to preserve the analysis). But given that commit the PDT could end up being
; required and calculated via BlockFrequency analysis. So the problem was that
; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't
; cached at the begininng of IPSCCP, to indicate that no updates where needed
; cached at the beginning of IPSCCP, to indicate that no updates were needed
; for PDT. But then the PDT was calculated, given the input IR, and preserved
; using the non-updated state (as the DTU wasn't configured for updating the
; PDT).

; CHECK-NOT: <badref>
; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
; CHECK-NEXT: [1] <<exit node>> {4294967295,4294967295} [0]
; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1]
; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1]
; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2]
; CHECK-NEXT: Roots: %for.cond34 %for.body
; CHECK-NEXT: [1] <<exit node>> {4294967295,4294967295} [0]
; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1]
; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1]
; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2]
; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
; CHECK-NEXT: Roots: %for.body %for.cond34
; CHECK-NEXT: PostDominatorTree for function: bar
; CHECK-NOT: <badref>

Expand Down
Loading
Loading