Skip to content

[MachinePipeliner] Make Recurrence MII More Accurate #105475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 52 additions & 18 deletions llvm/include/llvm/CodeGen/MachinePipeliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
}

void createAdjacencyStructure(SwingSchedulerDAG *DAG);
bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
bool circuit(int V, int S, NodeSetType &NodeSets,
const SwingSchedulerDAG *DAG, bool HasBackedge = false);
void unblock(int U);
};

Expand Down Expand Up @@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
}

bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
bool isSucc = true) const;

/// The distance function, which indicates that operation V of iteration I
/// depends on operations U of iteration I-distance.
Expand Down Expand Up @@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
void computeNodeOrder(NodeSetType &NodeSets);
void checkValidNodeOrder(const NodeSetType &Circuits) const;
bool schedulePipeline(SMSchedule &Schedule);
bool computeDelta(MachineInstr &MI, unsigned &Delta);
bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
MachineInstr *findDefInLoop(Register Reg);
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos, unsigned &NewBase,
Expand Down Expand Up @@ -339,24 +341,56 @@ class NodeSet {
using iterator = SetVector<SUnit *>::const_iterator;

NodeSet() = default;
NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
Latency = 0;
for (const SUnit *Node : Nodes) {
DenseMap<SUnit *, unsigned> SuccSUnitLatency;
for (const SDep &Succ : Node->Succs) {
auto SuccSUnit = Succ.getSUnit();
if (!Nodes.count(SuccSUnit))
NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
: Nodes(S, E), HasRecurrence(true) {
// Calculate the latency of this node set.
// Example to demonstrate the calculation:
// Given: N0 -> N1 -> N2 -> N0
// Edges:
// (N0 -> N1, 3)
// (N0 -> N1, 5)
// (N1 -> N2, 2)
// (N2 -> N0, 1)
// The total latency which is a lower bound of the recurrence MII is the
// longest path from N0 back to N0 given only the edges of this node set.
// In this example, the latency is: 5 + 2 + 1 = 8.
//
// Hold a map from each SUnit in the circuit to the maximum distance from the
// source node, considering only the nodes of this node set.
DenseMap<SUnit *, unsigned> SUnitToDistance;
for (auto *Node : Nodes)
SUnitToDistance[Node] = 0;

for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
SUnit *U = Nodes[I - 1];
SUnit *V = Nodes[I % Nodes.size()];
for (const SDep &Succ : U->Succs) {
SUnit *SuccSUnit = Succ.getSUnit();
if (V != SuccSUnit)
continue;
unsigned CurLatency = Succ.getLatency();
unsigned MaxLatency = 0;
if (SuccSUnitLatency.count(SuccSUnit))
MaxLatency = SuccSUnitLatency[SuccSUnit];
if (CurLatency > MaxLatency)
SuccSUnitLatency[SuccSUnit] = CurLatency;
if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
}
}
for (auto SUnitLatency : SuccSUnitLatency)
Latency += SUnitLatency.second;
}
// Handle a back-edge in loop carried dependencies
SUnit *FirstNode = Nodes[0];
SUnit *LastNode = Nodes[Nodes.size() - 1];

for (auto &PI : LastNode->Preds) {
// If we have an order dep that is potentially loop carried then a
// back-edge exists between the last node and the first node that isn't
// modeled in the DAG. Handle it manually by treating it as an edge of
// latency 1 from the last node back to the first node.
if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
!DAG->isLoopCarriedDep(LastNode, PI, false))
continue;
SUnitToDistance[FirstNode] =
std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
}

// The latency is the distance from the source node to itself.
Latency = SUnitToDistance[Nodes.front()];
}

bool insert(SUnit *SU) { return Nodes.insert(SU); }
Expand Down
16 changes: 9 additions & 7 deletions llvm/lib/CodeGen/MachinePipeliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1706,6 +1706,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
const SwingSchedulerDAG *DAG,
bool HasBackedge) {
SUnit *SV = &SUnits[V];
bool F = false;
Expand All @@ -1719,12 +1720,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
continue;
if (W == S) {
if (!HasBackedge)
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
F = true;
++NumPaths;
break;
} else if (!Blocked.test(W)) {
if (circuit(W, S, NodeSets,
}
if (!Blocked.test(W)) {
if (circuit(W, S, NodeSets, DAG,
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
F = true;
}
Expand Down Expand Up @@ -1767,9 +1769,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
Circuits Cir(SUnits, Topo);
// Create the adjacency structure.
Cir.createAdjacencyStructure(this);
for (int i = 0, e = SUnits.size(); i != e; ++i) {
for (int I = 0, E = SUnits.size(); I != E; ++I) {
Cir.reset();
Cir.circuit(i, i, NodeSets);
Cir.circuit(I, I, NodeSets, this);
}

// Change the dependences back so that we've created a DAG again.
Expand Down Expand Up @@ -2565,7 +2567,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
Expand Down Expand Up @@ -2719,7 +2721,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
bool isSucc) {
bool isSucc) const {
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
return false;
Expand Down
48 changes: 48 additions & 0 deletions llvm/test/CodeGen/PowerPC/sms-recmii.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
; RUN: -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s

; Test that the pipeliner doesn't overestimate the recurrence MII when evaluating circuits.
; CHECK: MII = 16 MAX_II = 26 (rec=16, res=5)
; Single-loop kernel used as pipeliner input. The entry block (implicitly
; numbered %7, see the phi below) computes the loop-invariant values, block
; %16 is the loop body and block %15 is the exit.
define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr {
; The two assumes state that %3 is greater than 64 and that its low bit is 0.
%8 = icmp sgt i32 %3, 64
tail call void @llvm.assume(i1 %8)
%9 = and i32 %3, 1
%10 = icmp eq i32 %9, 0
tail call void @llvm.assume(i1 %10)
; Loop-invariant values: %5 and %6 sign-extended, %3 zero-extended (it is
; compared against the incremented induction variable at the bottom of the
; loop), and %2 advanced by %12 bytes.
%11 = sext i32 %5 to i64
%12 = sext i32 %6 to i64
%13 = zext nneg i32 %3 to i64
%14 = getelementptr i8, ptr %2, i64 %12
br label %16

; Exit block.
15:
ret void

; Loop body.
16:
; %17 is the induction variable: 0 on entry from %7, %24 (= %17 + 1) on the
; back-edge from %16.
%17 = phi i64 [ 0, %7 ], [ %24, %16 ]
; Load a byte from %0 at offset %17 and sign-extend it. The extended value
; selects the byte of %1 that receives the constant 2, and also feeds the
; offset computation below.
%18 = getelementptr inbounds i8, ptr %0, i64 %17
%19 = load i8, ptr %18, align 1
%20 = sext i8 %19 to i64
%21 = getelementptr inbounds i8, ptr %1, i64 %20
store i8 2, ptr %21, align 1
; %a4 = (%20 + (((%17 * %11) arithmetic-shifted right by 2) + %v1)) * 5 is the
; byte offset applied to %14.
%22 = mul nsw i64 %17, %11
%a1 = ashr i64 %22, 2
%a2 = add i64 %a1, %v1
%a3 = add i64 %20, %a2
%a4 = mul nsw i64 %a3, 5
%23 = getelementptr i8, ptr %14, i64 %a4
; The byte at %23 is loaded, combined with the truncated offset through smin
; and a multiply, and the result is stored back through the same pointer %23.
%a5 = load i8, ptr %23, align 1
%a4_truncated = trunc i64 %a4 to i8
%min = call i8 @llvm.smin.i8(i8 %a5, i8 %a4_truncated)
%res = mul i8 %min, %a5
store i8 %res, ptr %23, align 1
; Advance the induction variable and leave the loop once it equals %13.
%24 = add nuw nsw i64 %17, 1
%25 = icmp eq i64 %24, %13
br i1 %25, label %15, label %16
}

declare void @llvm.assume(i1 noundef) #1
declare i8 @llvm.smin.i8(i8, i8)

attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,8 @@ body: |
; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)
Expand Down
Loading