diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index 7fe5581faa183..0cc862590d0c0 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { } void createAdjacencyStructure(SwingSchedulerDAG *DAG); - bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false); + bool circuit(int V, int S, NodeSetType &NodeSets, + const SwingSchedulerDAG *DAG, bool HasBackedge = false); void unblock(int U); }; @@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI(); } - bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true); + bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, + bool isSucc = true) const; /// The distance function, which indicates that operation V of iteration I /// depends on operations U of iteration I-distance. 
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { void computeNodeOrder(NodeSetType &NodeSets); void checkValidNodeOrder(const NodeSetType &Circuits) const; bool schedulePipeline(SMSchedule &Schedule); - bool computeDelta(MachineInstr &MI, unsigned &Delta); + bool computeDelta(MachineInstr &MI, unsigned &Delta) const; MachineInstr *findDefInLoop(Register Reg); bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, unsigned &OffsetPos, unsigned &NewBase, @@ -339,24 +341,56 @@ class NodeSet { using iterator = SetVector::const_iterator; NodeSet() = default; - NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) { - Latency = 0; - for (const SUnit *Node : Nodes) { - DenseMap SuccSUnitLatency; - for (const SDep &Succ : Node->Succs) { - auto SuccSUnit = Succ.getSUnit(); - if (!Nodes.count(SuccSUnit)) + NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG) + : Nodes(S, E), HasRecurrence(true) { + // Calculate the latency of this node set. + // Example to demonstrate the calculation: + // Given: N0 -> N1 -> N2 -> N0 + // Edges: + // (N0 -> N1, 3) + // (N0 -> N1, 5) + // (N1 -> N2, 2) + // (N2 -> N0, 1) + // The total latency, which is a lower bound of the recurrence MII, is the + // longest path from N0 back to N0 given only the edges of this node set. + // In this example, the latency is: 5 + 2 + 1 = 8. + // + // Hold a map from each SUnit in the circuit to its maximum distance from + // the source node, considering only the nodes of this node set. 
+ DenseMap SUnitToDistance; + for (auto *Node : Nodes) + SUnitToDistance[Node] = 0; + + for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) { + SUnit *U = Nodes[I - 1]; + SUnit *V = Nodes[I % Nodes.size()]; + for (const SDep &Succ : U->Succs) { + SUnit *SuccSUnit = Succ.getSUnit(); + if (V != SuccSUnit) continue; - unsigned CurLatency = Succ.getLatency(); - unsigned MaxLatency = 0; - if (SuccSUnitLatency.count(SuccSUnit)) - MaxLatency = SuccSUnitLatency[SuccSUnit]; - if (CurLatency > MaxLatency) - SuccSUnitLatency[SuccSUnit] = CurLatency; + if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) { + SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency(); + } } - for (auto SUnitLatency : SuccSUnitLatency) - Latency += SUnitLatency.second; } + // Handle a back-edge in loop-carried dependencies. + SUnit *FirstNode = Nodes[0]; + SUnit *LastNode = Nodes[Nodes.size() - 1]; + + for (auto &PI : LastNode->Preds) { + // If we have an order dep that is potentially loop-carried, then a + // back-edge exists between the last node and the first node that isn't + // modeled in the DAG. Handle it manually by adding 1 to the distance of + // the last node. + if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order || + !DAG->isLoopCarriedDep(LastNode, PI, false)) + continue; + SUnitToDistance[FirstNode] = + std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1); + } + + // The latency is the distance from the source node to itself. + Latency = SUnitToDistance[Nodes.front()]; } bool insert(SUnit *SU) { return Nodes.insert(SU); } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 416129ff837c3..34eaf211c17a3 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1706,6 +1706,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure( /// Identify an elementary circuit in the dependence graph starting at the /// specified node. 
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, + const SwingSchedulerDAG *DAG, bool HasBackedge) { SUnit *SV = &SUnits[V]; bool F = false; @@ -1719,12 +1720,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, continue; if (W == S) { if (!HasBackedge) - NodeSets.push_back(NodeSet(Stack.begin(), Stack.end())); + NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG)); F = true; ++NumPaths; break; - } else if (!Blocked.test(W)) { - if (circuit(W, S, NodeSets, + } + if (!Blocked.test(W)) { + if (circuit(W, S, NodeSets, DAG, Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge)) F = true; } @@ -1767,9 +1769,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) { Circuits Cir(SUnits, Topo); // Create the adjacency structure. Cir.createAdjacencyStructure(this); - for (int i = 0, e = SUnits.size(); i != e; ++i) { + for (int I = 0, E = SUnits.size(); I != E; ++I) { Cir.reset(); - Cir.circuit(i, i, NodeSets); + Cir.circuit(I, I, NodeSets, this); } // Change the dependences back so that we've created a DAG again. @@ -2565,7 +2567,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { /// Return true if we can compute the amount the instruction changes /// during each iteration. Set Delta to the amount of the change. -bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { +bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineOperand *BaseOp; int64_t Offset; @@ -2719,7 +2721,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) { /// potentially. A dependence is loop carried if the destination defines a value /// that may be used or defined by the source in a subsequent iteration. 
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, - bool isSucc) { + bool isSucc) const { if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) || Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode()) return false; diff --git a/llvm/test/CodeGen/PowerPC/sms-recmii.ll b/llvm/test/CodeGen/PowerPC/sms-recmii.ll new file mode 100644 index 0000000000000..45747f787b236 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sms-recmii.ll @@ -0,0 +1,48 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\ +; RUN: -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s + +; Test that the pipeliner doesn't overestimate the recurrence MII when evaluating circuits. +; CHECK: MII = 16 MAX_II = 26 (rec=16, res=5) +define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr { + %8 = icmp sgt i32 %3, 64 + tail call void @llvm.assume(i1 %8) + %9 = and i32 %3, 1 + %10 = icmp eq i32 %9, 0 + tail call void @llvm.assume(i1 %10) + %11 = sext i32 %5 to i64 + %12 = sext i32 %6 to i64 + %13 = zext nneg i32 %3 to i64 + %14 = getelementptr i8, ptr %2, i64 %12 + br label %16 + +15: + ret void + +16: + %17 = phi i64 [ 0, %7 ], [ %24, %16 ] + %18 = getelementptr inbounds i8, ptr %0, i64 %17 + %19 = load i8, ptr %18, align 1 + %20 = sext i8 %19 to i64 + %21 = getelementptr inbounds i8, ptr %1, i64 %20 + store i8 2, ptr %21, align 1 + %22 = mul nsw i64 %17, %11 + %a1 = ashr i64 %22, 2 + %a2 = add i64 %a1, %v1 + %a3 = add i64 %20, %a2 + %a4 = mul nsw i64 %a3, 5 + %23 = getelementptr i8, ptr %14, i64 %a4 + %a5 = load i8, ptr %23, align 1 + %a4_truncated = trunc i64 %a4 to i8 + %min = call i8 @llvm.smin.i8(i8 %a5, i8 %a4_truncated) + %res = mul i8 %min, %a5 + store i8 %res, ptr %23, align 1 + %24 = add nuw nsw i64 %17, 1 + 
%25 = icmp eq i64 %24, %13 + br i1 %25, label %15, label %16 +} + +declare void @llvm.assume(i1 noundef) #1 +declare i8 @llvm.smin.i8(i8, i8) + +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } diff --git a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir index 08f08c41917b1..6983c6f97cc81 100644 --- a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir +++ b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir @@ -222,8 +222,8 @@ body: | ; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg ; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg ; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9) - ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0) ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)