Skip to content

Commit feaef62

Browse files
committed
[LoopVectorize] Refine runtime memory check costs when there is an outer loop
When we generate runtime memory checks for an inner loop, it's possible that these checks are invariant in the outer loop and so will get hoisted out. In such cases, the effective cost of the checks should be reduced to reflect the outer loop trip count. This fixes a 25% performance regression introduced by commit 49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we decided the inner loop trip count wasn't high enough to warrant the (incorrect) high cost of the runtime checks. Also, when runtime memory checks consist entirely of diff checks, these are likely to be outer loop invariant.
1 parent ea50e94 commit feaef62

File tree

2 files changed

+63
-8
lines changed

2 files changed

+63
-8
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+52-2
Original file line numberDiff line numberDiff line change
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
19571957
bool CostTooHigh = false;
19581958
const bool AddBranchWeights;
19591959

1960+
Loop *OuterLoop = nullptr;
1961+
19601962
public:
19611963
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
19621964
TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ class GeneratedRTChecks {
20532055
DT->eraseNode(SCEVCheckBlock);
20542056
LI->removeBlock(SCEVCheckBlock);
20552057
}
2058+
2059+
// Outer loop is used as part of the later cost calculations.
2060+
OuterLoop = L->getParentLoop();
20562061
}
20572062

20582063
InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ class GeneratedRTChecks {
20762081
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
20772082
RTCheckCost += C;
20782083
}
2079-
if (MemCheckBlock)
2084+
if (MemCheckBlock) {
2085+
InstructionCost MemCheckCost = 0;
20802086
for (Instruction &I : *MemCheckBlock) {
20812087
if (MemCheckBlock->getTerminator() == &I)
20822088
continue;
20832089
InstructionCost C =
20842090
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
20852091
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2086-
RTCheckCost += C;
2092+
MemCheckCost += C;
20872093
}
20882094

2095+
// If the runtime memory checks are being created inside an outer loop
2096+
// we should find out if these checks are outer loop invariant. If so,
2097+
// the checks will likely be hoisted out and so the effective cost will
2098+
// reduce according to the outer loop trip count.
2099+
if (OuterLoop) {
2100+
ScalarEvolution *SE = MemCheckExp.getSE();
2101+
// TODO: If profitable, we could refine this further by analysing every
2102+
// individual memory check, since there could be a mixture of loop
2103+
// variant and invariant checks that mean the final condition is
2104+
// variant.
2105+
const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106+
if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107+
// It seems reasonable to assume that we can reduce the effective
2108+
// cost of the checks even when we know nothing about the trip
2109+
// count. Assume that the outer loop executes at least twice.
2110+
unsigned BestTripCount = 2;
2111+
2112+
// If exact trip count is known use that.
2113+
if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114+
BestTripCount = SmallTC;
2115+
else if (LoopVectorizeWithBlockFrequency) {
2116+
// Else use profile data if available.
2117+
if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118+
BestTripCount = *EstimatedTC;
2119+
}
2120+
2121+
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2122+
2123+
// Let's ensure the cost is always at least 1.
2124+
NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125+
(InstructionCost::CostType)1);
2126+
2127+
LLVM_DEBUG(dbgs()
2128+
<< "We expect runtime memory checks to be hoisted "
2129+
<< "out of the outer loop. Cost reduced from "
2130+
<< MemCheckCost << " to " << NewMemCheckCost << '\n');
2131+
2132+
MemCheckCost = NewMemCheckCost;
2133+
}
2134+
}
2135+
2136+
RTCheckCost += MemCheckCost;
2137+
}
2138+
20892139
if (SCEVCheckBlock || MemCheckBlock)
20902140
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
20912141
<< "\n");

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

+11-6
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ inner.exit:
3232
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
3333
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
3434
; CHECK: Calculating cost of runtime checks:
35-
; CHECK: Total cost of runtime checks: 6
35+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
36+
; CHECK: Total cost of runtime checks: 3
3637
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
3738
entry:
3839
br label %outer.loop
@@ -68,7 +69,8 @@ outer.exit:
6869
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
6970
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
7071
; CHECK: Calculating cost of runtime checks:
71-
; CHECK: Total cost of runtime checks: 6
72+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
73+
; CHECK: Total cost of runtime checks: 2
7274
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
7375
entry:
7476
br label %outer.loop
@@ -104,7 +106,8 @@ outer.exit:
104106
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
105107
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
106108
; CHECK: Calculating cost of runtime checks:
107-
; CHECK: Total cost of runtime checks: 6
109+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
110+
; CHECK: Total cost of runtime checks: 1
108111
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
109112
entry:
110113
br label %outer.loop
@@ -140,7 +143,8 @@ outer.exit:
140143
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
141144
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
142145
; CHECK: Calculating cost of runtime checks:
143-
; CHECK: Total cost of runtime checks: 6
146+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
147+
; CHECK: Total cost of runtime checks: 2
144148
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
145149
entry:
146150
br label %outer.loop
@@ -176,8 +180,9 @@ outer.exit:
176180
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
177181
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
178182
; CHECK: Calculating cost of runtime checks:
179-
; CHECK: Total cost of runtime checks: 6
180-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
183+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
184+
; CHECK: Total cost of runtime checks: 2
185+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
181186
entry:
182187
br label %outer.loop
183188

0 commit comments

Comments
 (0)