Skip to content

Commit bdaf16d

Browse files
david-arm authored and
tstellar committed
[LoopVectorize] Refine runtime memory check costs when there is an outer loop (#76034)
When we generate runtime memory checks for an inner loop it's possible that these checks are invariant in the outer loop and so will get hoisted out. In such cases, the effective cost of the checks should reduce to reflect the outer loop trip count. This fixes a 25% performance regression introduced by commit 49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we decided the inner loop trip count wasn't high enough to warrant the (incorrect) high cost of the runtime checks. Also, when runtime memory checks consist entirely of diff checks these are likely to be outer loop invariant. (cherry picked from commit 962fbaf)
1 parent 824a3e5 commit bdaf16d

File tree

2 files changed

+273
-6
lines changed

2 files changed

+273
-6
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+56-6
Original file line number | Diff line number | Diff line change
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
19571957
bool CostTooHigh = false;
19581958
const bool AddBranchWeights;
19591959

1960+
Loop *OuterLoop = nullptr;
1961+
19601962
public:
19611963
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
19621964
TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ class GeneratedRTChecks {
20532055
DT->eraseNode(SCEVCheckBlock);
20542056
LI->removeBlock(SCEVCheckBlock);
20552057
}
2058+
2059+
// Outer loop is used as part of the later cost calculations.
2060+
OuterLoop = L->getParentLoop();
20562061
}
20572062

20582063
InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ class GeneratedRTChecks {
20762081
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
20772082
RTCheckCost += C;
20782083
}
2079-
if (MemCheckBlock)
2084+
if (MemCheckBlock) {
2085+
InstructionCost MemCheckCost = 0;
20802086
for (Instruction &I : *MemCheckBlock) {
20812087
if (MemCheckBlock->getTerminator() == &I)
20822088
continue;
20832089
InstructionCost C =
20842090
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
20852091
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2086-
RTCheckCost += C;
2092+
MemCheckCost += C;
20872093
}
20882094

2095+
// If the runtime memory checks are being created inside an outer loop
2096+
// we should find out if these checks are outer loop invariant. If so,
2097+
// the checks will likely be hoisted out and so the effective cost will
2098+
// reduce according to the outer loop trip count.
2099+
if (OuterLoop) {
2100+
ScalarEvolution *SE = MemCheckExp.getSE();
2101+
// TODO: If profitable, we could refine this further by analysing every
2102+
// individual memory check, since there could be a mixture of loop
2103+
// variant and invariant checks that mean the final condition is
2104+
// variant.
2105+
const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106+
if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107+
// It seems reasonable to assume that we can reduce the effective
2108+
// cost of the checks even when we know nothing about the trip
2109+
// count. Assume that the outer loop executes at least twice.
2110+
unsigned BestTripCount = 2;
2111+
2112+
// If exact trip count is known use that.
2113+
if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114+
BestTripCount = SmallTC;
2115+
else if (LoopVectorizeWithBlockFrequency) {
2116+
// Else use profile data if available.
2117+
if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118+
BestTripCount = *EstimatedTC;
2119+
}
2120+
2121+
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2122+
2123+
// Let's ensure the cost is always at least 1.
2124+
NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125+
(InstructionCost::CostType)1);
2126+
2127+
LLVM_DEBUG(dbgs()
2128+
<< "We expect runtime memory checks to be hoisted "
2129+
<< "out of the outer loop. Cost reduced from "
2130+
<< MemCheckCost << " to " << NewMemCheckCost << '\n');
2131+
2132+
MemCheckCost = NewMemCheckCost;
2133+
}
2134+
}
2135+
2136+
RTCheckCost += MemCheckCost;
2137+
}
2138+
20892139
if (SCEVCheckBlock || MemCheckBlock)
20902140
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
20912141
<< "\n");
@@ -2144,8 +2194,8 @@ class GeneratedRTChecks {
21442194

21452195
BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
21462196
// Create new preheader for vector loop.
2147-
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2148-
PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2197+
if (OuterLoop)
2198+
OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
21492199

21502200
SCEVCheckBlock->getTerminator()->eraseFromParent();
21512201
SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2179,8 +2229,8 @@ class GeneratedRTChecks {
21792229
DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
21802230
MemCheckBlock->moveBefore(LoopVectorPreHeader);
21812231

2182-
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2183-
PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2232+
if (OuterLoop)
2233+
OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
21842234

21852235
BranchInst &BI =
21862236
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,217 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) {
7+
; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop'
8+
; CHECK: Calculating cost of runtime checks:
9+
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
10+
; CHECK: Total cost of runtime checks: 4
11+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
12+
entry:
13+
br label %inner.loop
14+
15+
inner.loop:
16+
%inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ]
17+
%add.us = add nuw nsw i64 %inner.iv, %off
18+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
19+
%0 = load i8, ptr %arrayidx.us, align 1
20+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
21+
%1 = load i8, ptr %arrayidx7.us, align 1
22+
%add9.us = add i8 %1, %0
23+
store i8 %add9.us, ptr %arrayidx7.us, align 1
24+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
25+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
26+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
27+
28+
inner.exit:
29+
ret void
30+
}
31+
32+
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
33+
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
34+
; CHECK: Calculating cost of runtime checks:
35+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
36+
; CHECK: Total cost of runtime checks: 3
37+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
38+
entry:
39+
br label %outer.loop
40+
41+
outer.loop:
42+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
43+
%mul.us = mul nsw i64 %outer.iv, %n
44+
br label %inner.loop
45+
46+
inner.loop:
47+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
48+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
49+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
50+
%0 = load i8, ptr %arrayidx.us, align 1
51+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
52+
%1 = load i8, ptr %arrayidx7.us, align 1
53+
%add9.us = add i8 %1, %0
54+
store i8 %add9.us, ptr %arrayidx7.us, align 1
55+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
56+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
57+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
58+
59+
inner.exit:
60+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
61+
%exitcond27.not = icmp eq i64 %outer.iv.next, %m
62+
br i1 %exitcond27.not, label %outer.exit, label %outer.loop
63+
64+
outer.exit:
65+
ret void
66+
}
67+
68+
69+
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
70+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
71+
; CHECK: Calculating cost of runtime checks:
72+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
73+
; CHECK: Total cost of runtime checks: 2
74+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
75+
entry:
76+
br label %outer.loop
77+
78+
outer.loop:
79+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
80+
%mul.us = mul nsw i64 %outer.iv, %n
81+
br label %inner.loop
82+
83+
inner.loop:
84+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
85+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
86+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
87+
%0 = load i8, ptr %arrayidx.us, align 1
88+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
89+
%1 = load i8, ptr %arrayidx7.us, align 1
90+
%add9.us = add i8 %1, %0
91+
store i8 %add9.us, ptr %arrayidx7.us, align 1
92+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
93+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
94+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
95+
96+
inner.exit:
97+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
98+
%exitcond26.not = icmp eq i64 %outer.iv.next, 3
99+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop
100+
101+
outer.exit:
102+
ret void
103+
}
104+
105+
106+
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
107+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
108+
; CHECK: Calculating cost of runtime checks:
109+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
110+
; CHECK: Total cost of runtime checks: 1
111+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
112+
entry:
113+
br label %outer.loop
114+
115+
outer.loop:
116+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
117+
%mul.us = mul nsw i64 %outer.iv, %n
118+
br label %inner.loop
119+
120+
inner.loop:
121+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
122+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
123+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
124+
%0 = load i8, ptr %arrayidx.us, align 1
125+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
126+
%1 = load i8, ptr %arrayidx7.us, align 1
127+
%add9.us = add i8 %1, %0
128+
store i8 %add9.us, ptr %arrayidx7.us, align 1
129+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
130+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
131+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
132+
133+
inner.exit:
134+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
135+
%exitcond26.not = icmp eq i64 %outer.iv.next, 64
136+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop
137+
138+
outer.exit:
139+
ret void
140+
}
141+
142+
143+
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
144+
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
145+
; CHECK: Calculating cost of runtime checks:
146+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
147+
; CHECK: Total cost of runtime checks: 2
148+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
149+
entry:
150+
br label %outer.loop
151+
152+
outer.loop:
153+
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
154+
%mul.us = mul nsw i64 %outer.iv, %n
155+
br label %inner.loop
156+
157+
inner.loop:
158+
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
159+
%add.us = add nuw nsw i64 %inner.iv, %mul.us
160+
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
161+
%0 = load i8, ptr %arrayidx.us, align 1
162+
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
163+
%1 = load i8, ptr %arrayidx7.us, align 1
164+
%add9.us = add i8 %1, %0
165+
store i8 %add9.us, ptr %arrayidx7.us, align 1
166+
%inner.iv.next = add nuw nsw i64 %inner.iv, 1
167+
%exitcond.not = icmp eq i64 %inner.iv.next, %n
168+
br i1 %exitcond.not, label %inner.exit, label %inner.loop
169+
170+
inner.exit:
171+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
172+
%exitcond26.not = icmp eq i64 %outer.iv.next, %m
173+
br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
174+
175+
outer.exit:
176+
ret void
177+
}
178+
179+
180+
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
181+
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
182+
; CHECK: Calculating cost of runtime checks:
183+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
184+
; CHECK: Total cost of runtime checks: 2
185+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
186+
entry:
187+
br label %outer.loop
188+
189+
outer.loop:
190+
%outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
191+
%0 = mul nsw i64 %outer.iv, %n
192+
br label %inner.loop
193+
194+
inner.loop:
195+
%iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
196+
%1 = add nuw nsw i64 %iv.inner, %0
197+
%arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
198+
%2 = load i32, ptr %arrayidx.us, align 4
199+
%arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
200+
%3 = load i32, ptr %arrayidx8.us, align 4
201+
%add9.us = add nsw i32 %3, %2
202+
store i32 %add9.us, ptr %arrayidx8.us, align 4
203+
%iv.inner.next = add nuw nsw i64 %iv.inner, 1
204+
%inner.exit.cond = icmp eq i64 %iv.inner.next, %n
205+
br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
206+
207+
inner.exit:
208+
%outer.iv.next = add nuw nsw i64 %outer.iv, 1
209+
%outer.exit.cond = icmp eq i64 %outer.iv.next, 3
210+
br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
211+
212+
outer.exit:
213+
ret void
214+
}
215+
216+
217+
!0 = !{!"branch_weights", i32 10, i32 20}

0 commit comments

Comments
 (0)