Skip to content

Commit 00c198b

Browse files
authored
[MachinePipeliner] Make Recurrence MII More Accurate (#105475)
The current RecMII calculation overestimates the recurrence MII. This patch refines the calculation to make it more accurate.
1 parent 7d7d2d2 commit 00c198b

File tree

4 files changed

+110
-26
lines changed

4 files changed

+110
-26
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
197197
}
198198

199199
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
200-
bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
200+
bool circuit(int V, int S, NodeSetType &NodeSets,
201+
const SwingSchedulerDAG *DAG, bool HasBackedge = false);
201202
void unblock(int U);
202203
};
203204

@@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
260261
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
261262
}
262263

263-
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
264+
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
265+
bool isSucc = true) const;
264266

265267
/// The distance function, which indicates that operation V of iteration I
266268
/// depends on operations U of iteration I-distance.
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
311313
void computeNodeOrder(NodeSetType &NodeSets);
312314
void checkValidNodeOrder(const NodeSetType &Circuits) const;
313315
bool schedulePipeline(SMSchedule &Schedule);
314-
bool computeDelta(MachineInstr &MI, unsigned &Delta);
316+
bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
315317
MachineInstr *findDefInLoop(Register Reg);
316318
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
317319
unsigned &OffsetPos, unsigned &NewBase,
@@ -339,24 +341,56 @@ class NodeSet {
339341
using iterator = SetVector<SUnit *>::const_iterator;
340342

341343
NodeSet() = default;
342-
NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
343-
Latency = 0;
344-
for (const SUnit *Node : Nodes) {
345-
DenseMap<SUnit *, unsigned> SuccSUnitLatency;
346-
for (const SDep &Succ : Node->Succs) {
347-
auto SuccSUnit = Succ.getSUnit();
348-
if (!Nodes.count(SuccSUnit))
344+
NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
345+
: Nodes(S, E), HasRecurrence(true) {
346+
// Calculate the latency of this node set.
347+
// Example to demonstrate the calculation:
348+
// Given: N0 -> N1 -> N2 -> N0
349+
// Edges:
350+
// (N0 -> N1, 3)
351+
// (N0 -> N1, 5)
352+
// (N1 -> N2, 2)
353+
// (N2 -> N0, 1)
354+
// The total latency, which is a lower bound of the recurrence MII, is the
355+
// longest path from N0 back to N0 given only the edges of this node set.
356+
// In this example, the latency is: 5 + 2 + 1 = 8.
357+
//
358+
// Hold a map from each SUnit in the circuit to the maximum distance from the
359+
// source node, considering only the nodes in this node set.
360+
DenseMap<SUnit *, unsigned> SUnitToDistance;
361+
for (auto *Node : Nodes)
362+
SUnitToDistance[Node] = 0;
363+
364+
for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
365+
SUnit *U = Nodes[I - 1];
366+
SUnit *V = Nodes[I % Nodes.size()];
367+
for (const SDep &Succ : U->Succs) {
368+
SUnit *SuccSUnit = Succ.getSUnit();
369+
if (V != SuccSUnit)
349370
continue;
350-
unsigned CurLatency = Succ.getLatency();
351-
unsigned MaxLatency = 0;
352-
if (SuccSUnitLatency.count(SuccSUnit))
353-
MaxLatency = SuccSUnitLatency[SuccSUnit];
354-
if (CurLatency > MaxLatency)
355-
SuccSUnitLatency[SuccSUnit] = CurLatency;
371+
if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
372+
SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
373+
}
356374
}
357-
for (auto SUnitLatency : SuccSUnitLatency)
358-
Latency += SUnitLatency.second;
359375
}
376+
// Handle a back-edge in loop carried dependencies
377+
SUnit *FirstNode = Nodes[0];
378+
SUnit *LastNode = Nodes[Nodes.size() - 1];
379+
380+
for (auto &PI : LastNode->Preds) {
381+
// If we have an order dep that is potentially loop carried then a
382+
// back-edge exists between the last node and the first node that isn't
383+
// modeled in the DAG. Handle it manually by adding 1 to the distance of
384+
// the last node.
385+
if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
386+
!DAG->isLoopCarriedDep(LastNode, PI, false))
387+
continue;
388+
SUnitToDistance[FirstNode] =
389+
std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
390+
}
391+
392+
// The latency is the distance from the source node to itself.
393+
Latency = SUnitToDistance[Nodes.front()];
360394
}
361395

362396
bool insert(SUnit *SU) { return Nodes.insert(SU); }

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,6 +1706,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
17061706
/// Identify an elementary circuit in the dependence graph starting at the
17071707
/// specified node.
17081708
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1709+
const SwingSchedulerDAG *DAG,
17091710
bool HasBackedge) {
17101711
SUnit *SV = &SUnits[V];
17111712
bool F = false;
@@ -1719,12 +1720,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
17191720
continue;
17201721
if (W == S) {
17211722
if (!HasBackedge)
1722-
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1723+
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
17231724
F = true;
17241725
++NumPaths;
17251726
break;
1726-
} else if (!Blocked.test(W)) {
1727-
if (circuit(W, S, NodeSets,
1727+
}
1728+
if (!Blocked.test(W)) {
1729+
if (circuit(W, S, NodeSets, DAG,
17281730
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
17291731
F = true;
17301732
}
@@ -1767,9 +1769,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
17671769
Circuits Cir(SUnits, Topo);
17681770
// Create the adjacency structure.
17691771
Cir.createAdjacencyStructure(this);
1770-
for (int i = 0, e = SUnits.size(); i != e; ++i) {
1772+
for (int I = 0, E = SUnits.size(); I != E; ++I) {
17711773
Cir.reset();
1772-
Cir.circuit(i, i, NodeSets);
1774+
Cir.circuit(I, I, NodeSets, this);
17731775
}
17741776

17751777
// Change the dependences back so that we've created a DAG again.
@@ -2565,7 +2567,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
25652567

25662568
/// Return true if we can compute the amount the instruction changes
25672569
/// during each iteration. Set Delta to the amount of the change.
2568-
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2570+
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
25692571
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
25702572
const MachineOperand *BaseOp;
25712573
int64_t Offset;
@@ -2719,7 +2721,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
27192721
/// potentially. A dependence is loop carried if the destination defines a value
27202722
/// that may be used or defined by the source in a subsequent iteration.
27212723
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2722-
bool isSucc) {
2724+
bool isSucc) const {
27232725
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
27242726
Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
27252727
return false;
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; REQUIRES: asserts
2+
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
3+
; RUN: -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s
4+
5+
; Test that the pipeliner doesn't overestimate the recurrence MII when evaluating circuits.
6+
; CHECK: MII = 16 MAX_II = 26 (rec=16, res=5)
7+
define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr {
8+
%8 = icmp sgt i32 %3, 64
9+
tail call void @llvm.assume(i1 %8)
10+
%9 = and i32 %3, 1
11+
%10 = icmp eq i32 %9, 0
12+
tail call void @llvm.assume(i1 %10)
13+
%11 = sext i32 %5 to i64
14+
%12 = sext i32 %6 to i64
15+
%13 = zext nneg i32 %3 to i64
16+
%14 = getelementptr i8, ptr %2, i64 %12
17+
br label %16
18+
19+
15:
20+
ret void
21+
22+
16:
23+
%17 = phi i64 [ 0, %7 ], [ %24, %16 ]
24+
%18 = getelementptr inbounds i8, ptr %0, i64 %17
25+
%19 = load i8, ptr %18, align 1
26+
%20 = sext i8 %19 to i64
27+
%21 = getelementptr inbounds i8, ptr %1, i64 %20
28+
store i8 2, ptr %21, align 1
29+
%22 = mul nsw i64 %17, %11
30+
%a1 = ashr i64 %22, 2
31+
%a2 = add i64 %a1, %v1
32+
%a3 = add i64 %20, %a2
33+
%a4 = mul nsw i64 %a3, 5
34+
%23 = getelementptr i8, ptr %14, i64 %a4
35+
%a5 = load i8, ptr %23, align 1
36+
%a4_truncated = trunc i64 %a4 to i8
37+
%min = call i8 @llvm.smin.i8(i8 %a5, i8 %a4_truncated)
38+
%res = mul i8 %min, %a5
39+
store i8 %res, ptr %23, align 1
40+
%24 = add nuw nsw i64 %17, 1
41+
%25 = icmp eq i64 %24, %13
42+
br i1 %25, label %15, label %16
43+
}
44+
45+
declare void @llvm.assume(i1 noundef) #1
46+
declare i8 @llvm.smin.i8(i8, i8)
47+
48+
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }

llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ body: |
222222
; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
223223
; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
224224
; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
225-
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
226225
; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
226+
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
227227
; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
228228
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
229229
; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)

0 commit comments

Comments
 (0)